simgear/misc/strutils.cxx

   1 // String utilities.
   2 //
   3 // Written by Bernie Bright, started 1998
   4 //
   5 // Copyright (C) 1998  Bernie Bright - bbright@bigpond.net.au
   6 //
   7 // This library is free software; you can redistribute it and/or
   8 // modify it under the terms of the GNU Library General Public
   9 // License as published by the Free Software Foundation; either
  10 // version 2 of the License, or (at your option) any later version.
  11 //
  12 // This library is distributed in the hope that it will be useful,
  13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 // Library General Public License for more details.
  16 //
  17 // You should have received a copy of the GNU General Public License
  18 // along with this program; if not, write to the Free Software
  19 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20 //
  21 // $Id$
  22
  23 #include <ctype.h>
  24 #include <cstring>
  25 #include <sstream>
  26
  27 #include "strutils.hxx"
  28
  29 #include <simgear/debug/logstream.hxx>
  30 #include <simgear/package/md5.h>
  31
  32 using std::string;
  33 using std::vector;
  34 using std::stringstream;
  35
  36 namespace simgear {
  37     namespace strutils {
  38
  39         /*
         * utf8ToLatin1() converts UTF-8 to Latin-1; useful for accented characters (e.g. éâàîè...)
  41          */
  42         template <typename Iterator> size_t get_length (Iterator p) {
  43                 unsigned char c = static_cast<unsigned char> (*p);
  44                 if (c < 0x80) return 1;
  45                 else if (!(c & 0x20)) return 2;
  46                 else if (!(c & 0x10)) return 3;
  47                 else if (!(c & 0x08)) return 4;
  48                 else if (!(c & 0x04)) return 5;
  49                 else return 6;
  50         }
  51
  52         typedef unsigned int value_type;
  53         template <typename Iterator> value_type get_value (Iterator p) {
  54                 size_t len = get_length (p);
  55                 if (len == 1) return *p;
  56                 value_type res = static_cast<unsigned char> ( *p & (0xff >> (len + 1))) << ((len - 1) * 6 );
  57                 for (--len; len; --len) {
  58                         value_type next_byte = static_cast<unsigned char> (*(++p)) - 0x80;
  59                         if (next_byte & 0xC0) return 0x00ffffff; // invalid UTF-8
  60                         res |= next_byte << ((len - 1) * 6);
  61                         }
  62                 return res;
  63         }
  64
  65         string utf8ToLatin1( string& s_utf8 ) {
  66                 string s_latin1;
  67                 for (string::iterator p = s_utf8.begin(); p != s_utf8.end(); ++p) {
  68                         value_type value = get_value<string::iterator&>(p);
  69                         if (value > 0x10ffff) return s_utf8; // invalid UTF-8: guess that the input was already Latin-1
  70                         if (value > 0xff) SG_LOG(SG_IO, SG_WARN, "utf8ToLatin1: wrong char value: " << value);
  71                         s_latin1 += static_cast<char>(value);
  72                 }
  73                 return s_latin1;
  74         }
  75
  76         /**
  77          *
  78          */
  79         static vector<string>
  80         split_whitespace( const string& str, int maxsplit )
  81         {
  82             vector<string> result;
  83             string::size_type len = str.length();
  84             string::size_type i = 0;
  85             string::size_type j;
  86             int countsplit = 0;
  87
  88             while (i < len)
  89             {
  90                 while (i < len && isspace((unsigned char)str[i]))
  91                 {
  92                     ++i;
  93                 }
  94
  95                 j = i;
  96
  97                 while (i < len && !isspace((unsigned char)str[i]))
  98                 {
  99                     ++i;
 100                 }
 101
 102                 if (j < i)
 103                 {
 104                     result.push_back( str.substr(j, i-j) );
 105                     ++countsplit;
 106                     while (i < len && isspace((unsigned char)str[i]))
 107                     {
 108                         ++i;
 109                     }
 110
 111                     if (maxsplit && (countsplit >= maxsplit) && i < len)
 112                     {
 113                         result.push_back( str.substr( i, len-i ) );
 114                         i = len;
 115                     }
 116                 }
 117             }
 118
 119             return result;
 120         }
 121
 122         /**
 123          *
 124          */
 125         vector<string>
 126         split( const string& str, const char* sep, int maxsplit )
 127         {
 128             if (sep == 0)
 129                 return split_whitespace( str, maxsplit );
 130
 131             vector<string> result;
 132             int n = std::strlen( sep );
 133             if (n == 0)
 134             {
 135                 // Error: empty separator string
 136                 return result;
 137             }
 138             const char* s = str.c_str();
 139             string::size_type len = str.length();
 140             string::size_type i = 0;
 141             string::size_type j = 0;
 142             int splitcount = 0;
 143
 144             while (i+n <= len)
 145             {
 146                 if (s[i] == sep[0] && (n == 1 || std::memcmp(s+i, sep, n) == 0))
 147                 {
 148                     result.push_back( str.substr(j,i-j) );
 149                     i = j = i + n;
 150                     ++splitcount;
 151                     if (maxsplit && (splitcount >= maxsplit))
 152                         break;
 153                 }
 154                 else
 155                 {
 156                     ++i;
 157                 }
 158             }
 159
 160             result.push_back( str.substr(j,len-j) );
 161             return result;
 162         }
 163
 164         /**
 165          * The lstrip(), rstrip() and strip() functions are implemented
 166          * in do_strip() which uses an additional parameter to indicate what
 167          * type of strip should occur.
 168          */
 169         const int LEFTSTRIP = 0;
 170         const int RIGHTSTRIP = 1;
 171         const int BOTHSTRIP = 2;
 172
 173         static string
 174         do_strip( const string& s, int striptype )
 175         {
 176             string::size_type len = s.length();
 177             if( len == 0 ) // empty string is trivial
 178                 return s;
 179             string::size_type i = 0;
 180             if (striptype != RIGHTSTRIP)
 181             {
 182                 while (i < len && isspace(s[i]))
 183                 {
 184                     ++i;
 185                 }
 186             }
 187
 188             string::size_type j = len;
 189             if (striptype != LEFTSTRIP)
 190             {
 191                 do
 192                 {
 193                     --j;
 194                 }
 195                 while (j >= 1 && isspace(s[j]));
 196                 ++j;
 197             }
 198
 199             if (i == 0 && j == len)
 200             {
 201                 return s;
 202             }
 203             else
 204             {
 205                 return s.substr( i, j - i );
 206             }
 207         }
 208
 209         string
 210         lstrip( const string& s )
 211         {
 212             return do_strip( s, LEFTSTRIP );
 213         }
 214
 215         string
 216         rstrip( const string& s )
 217         {
 218             return do_strip( s, RIGHTSTRIP );
 219         }
 220
 221         string
 222         strip( const string& s )
 223         {
 224             return do_strip( s, BOTHSTRIP );
 225         }
 226
 227         string
 228         rpad( const string & s, string::size_type length, char c )
 229         {
 230             string::size_type l = s.length();
 231             if( l >= length ) return s;
 232             string reply = s;
 233             return reply.append( length-l, c );
 234         }
 235
 236         string
 237         lpad( const string & s, size_t length, char c )
 238         {
 239             string::size_type l = s.length();
 240             if( l >= length ) return s;
 241             string reply = s;
 242             return reply.insert( 0, length-l, c );
 243         }
 244
 245         bool
 246         starts_with( const string & s, const string & substr )
 247         {
 248           return s.compare(0, substr.length(), substr) == 0;
 249         }
 250
 251         bool
 252         ends_with( const string & s, const string & substr )
 253         {
 254           if( substr.length() > s.length() )
 255             return false;
 256           return s.compare( s.length() - substr.length(),
 257                             substr.length(),
 258                             substr ) == 0;
 259         }
 260
 261     string simplify(const string& s)
 262     {
 263         string result; // reserve size of 's'?
 264         string::const_iterator it = s.begin(),
 265             end = s.end();
 266
 267     // advance to first non-space char - simplifes logic in main loop,
 268     // since we can always prepend a single space when we see a
 269     // space -> non-space transition
 270         for (; (it != end) && isspace(*it); ++it) { /* nothing */ }
 271
 272         bool lastWasSpace = false;
 273         for (; it != end; ++it) {
 274             char c = *it;
 275             if (isspace(c)) {
 276                 lastWasSpace = true;
 277                 continue;
 278             }
 279
 280             if (lastWasSpace) {
 281                 result.push_back(' ');
 282             }
 283
 284             lastWasSpace = false;
 285             result.push_back(c);
 286         }
 287
 288         return result;
 289     }
 290
 291     int to_int(const std::string& s, int base)
 292     {
 293         stringstream ss(s);
 294         switch (base) {
 295         case 8:      ss >> std::oct; break;
 296         case 16:     ss >> std::hex; break;
 297         default: break;
 298         }
 299
 300         int result;
 301         ss >> result;
 302         return result;
 303     }
 304
 305     int compare_versions(const string& v1, const string& v2)
 306     {
 307         vector<string> v1parts(split(v1, "."));
 308         vector<string> v2parts(split(v2, "."));
 309
 310         int lastPart = std::min(v1parts.size(), v2parts.size());
 311         for (int part=0; part < lastPart; ++part) {
 312             int part1 = to_int(v1parts[part]);
 313             int part2 = to_int(v2parts[part]);
 314
 315             if (part1 != part2) {
 316                 return part1 - part2;
 317             }
 318         } // of parts iteration
 319
 320         // reached end - longer wins
 321         return v1parts.size() - v2parts.size();
 322     }
 323
 324     string join(const string_list& l, const string& joinWith)
 325     {
 326         string result;
 327         unsigned int count = l.size();
 328         for (unsigned int i=0; i < count; ++i) {
 329             result += l[i];
 330             if (i < (count - 1)) {
 331                 result += joinWith;
 332             }
 333         }
 334
 335         return result;
 336     }
 337
 338     string uppercase(const string &s) {
 339       string rslt(s);
 340       for(string::iterator p = rslt.begin(); p != rslt.end(); p++){
 341         *p = toupper(*p);
 342       }
 343       return rslt;
 344     }
 345
 346     string lowercase(const string &s) {
 347       string rslt(s);
 348       for(string::iterator p = rslt.begin(); p != rslt.end(); p++){
 349         *p = tolower(*p);
 350       }
 351       return rslt;
 352     }
 353
 354     void lowercase(string &s) {
 355       for(string::iterator p = s.begin(); p != s.end(); p++){
 356         *p = tolower(*p);
 357       }
 358     }
 359
 360 #if defined(SG_WINDOWS)
 361
 362 #include <windows.h>
 363
 364 static WCharVec convertMultiByteToWString(DWORD encoding, const std::string& a)
 365 {
 366     WCharVec result;
 367     DWORD flags = 0;
 368     int requiredWideChars = MultiByteToWideChar(encoding, flags,
 369                         a.c_str(), a.size(),
 370                         NULL, 0);
 371     result.resize(requiredWideChars);
 372     MultiByteToWideChar(encoding, flags, a.c_str(), a.size(),
 373                         result.data(), result.size());
 374     return result;
 375 }
 376
 377 WCharVec convertUtf8ToWString(const std::string& a)
 378 {
 379     return convertMultiByteToWString(CP_UTF8, a);
 380 }
 381
 382 #endif
 383
 384 std::string convertWindowsLocal8BitToUtf8(const std::string& a)
 385 {
 386 #ifdef SG_WINDOWS
 387     DWORD flags = 0;
 388     WCharVec wideString = convertMultiByteToWString(CP_ACP, a);
 389
 390     // convert down to UTF-8
 391     std::vector<char> result;
 392     int requiredUTF8Chars = WideCharToMultiByte(CP_UTF8, flags,
 393                                                 wideString.data(), wideString.size(),
 394                                                 NULL, 0, NULL, NULL);
 395     result.resize(requiredUTF8Chars);
 396     WideCharToMultiByte(CP_UTF8, flags,
 397                         wideString.data(), wideString.size(),
 398                         result.data(), result.size(), NULL, NULL);
 399     return std::string(result.data(), result.size());
 400 #else
 401     return a;
 402 #endif
 403 }
 404
 405 //------------------------------------------------------------------------------
 406 std::string md5(const unsigned char* data, size_t num)
 407 {
 408   SG_MD5_CTX md5_ctx;
 409   SG_MD5Init(&md5_ctx);
 410   SG_MD5Update(&md5_ctx, data, num);
 411
 412   unsigned char digest[MD5_DIGEST_LENGTH];
 413   SG_MD5Final(digest, &md5_ctx);
 414
 415   return encodeHex(digest, MD5_DIGEST_LENGTH);
 416 }
 417
 418 //------------------------------------------------------------------------------
 419 std::string md5(const char* data, size_t num)
 420 {
 421   return md5(reinterpret_cast<const unsigned char*>(data), num);
 422 }
 423
 424 //------------------------------------------------------------------------------
 425 std::string md5(const std::string& str)
 426 {
 427   return md5(reinterpret_cast<const unsigned char*>(str.c_str()), str.size());
 428 }
 429
 430 //------------------------------------------------------------------------------
// The 64 characters of the base64 alphabet, in value order (index 0 is 'A',
// index 63 is '/').
// NOTE(review): no code visible in this part of the file references this
// string; confirm whether it is still needed.
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";

// Reverse lookup used by decodeBase64(): indexed by 7-bit character code,
// gives the 6-bit value of that base64 character. Characters outside the
// alphabet map to 127, and '=' (code 61) maps to 64. The table has 128
// entries only, so indices must be below 128.
static const unsigned char base64_decode_map[128] =
{
    127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
    127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
    127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
    127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
    127, 127, 127,  62, 127, 127, 127,  63,  52,  53,
    54,  55,  56,  57,  58,  59,  60,  61, 127, 127,
    127,  64, 127, 127, 127,   0,   1,   2,   3,   4,
    5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
    15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
    25, 127, 127, 127, 127, 127, 127,  26,  27,  28,
    29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
    39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
    49,  50,  51, 127, 127, 127, 127, 127
};
 452
 453
 454 static inline bool is_base64(unsigned char c) {
 455   return (isalnum(c) || (c == '+') || (c == '/'));
 456 }
 457
 458 static bool is_whitespace(unsigned char c) {
 459     return ((c == ' ') || (c == '\r') || (c == '\n'));
 460 }
 461
 462 void decodeBase64(const std::string& encoded_string, std::vector<unsigned char>& ret)
 463 {
 464   int in_len = encoded_string.size();
 465   int i = 0;
 466   int j = 0;
 467   int in_ = 0;
 468   unsigned char char_array_4[4], char_array_3[3];
 469
 470   while (in_len-- && ( encoded_string[in_] != '=')) {
 471     if (is_whitespace( encoded_string[in_])) {
 472         in_++;
 473         continue;
 474     }
 475
 476     if (!is_base64(encoded_string[in_])) {
 477         break;
 478     }
 479
 480     char_array_4[i++] = encoded_string[in_]; in_++;
 481     if (i ==4) {
 482       for (i = 0; i <4; i++)
 483         char_array_4[i] = base64_decode_map[char_array_4[i]];
 484
 485       char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
 486       char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
 487       char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
 488
 489       for (i = 0; (i < 3); i++)
 490         ret.push_back(char_array_3[i]);
 491       i = 0;
 492     }
 493   }
 494
 495   if (i) {
 496     for (j = i; j <4; j++)
 497       char_array_4[j] = 0;
 498
 499     for (j = 0; j <4; j++)
 500       char_array_4[j] = base64_decode_map[char_array_4[j]];
 501
 502     char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
 503     char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
 504     char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
 505
 506     for (j = 0; (j < i - 1); j++) ret.push_back(char_array_3[j]);
 507   }
 508 }
 509
 510 //------------------------------------------------------------------------------
// Lowercase hexadecimal digit for each 4-bit value (index 0-15); used by
// encodeHex() to render the high and low nibble of every byte.
const char hexChar[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
 512
 513 std::string encodeHex(const std::string& bytes)
 514 {
 515   return encodeHex(
 516     reinterpret_cast<const unsigned char*>(bytes.c_str()),
 517     bytes.size()
 518   );
 519 }
 520
 521 std::string encodeHex(const unsigned char* rawBytes, unsigned int length)
 522 {
 523   std::string hex(length * 2, '\0');
 524   for (unsigned int i=0; i<length;++i) {
 525       unsigned char c = *rawBytes++;
 526       hex[i * 2] = hexChar[c >> 4];
 527       hex[i * 2 + 1] = hexChar[c & 0x0f];
 528   }
 529
 530   return hex;
 531 }
 532
 533 //------------------------------------------------------------------------------
 534 std::string unescape(const char* s)
 535 {
 536   std::string r;
 537   while( *s )
 538   {
 539     if( *s != '\\' )
 540     {
 541       r += *s++;
 542       continue;
 543     }
 544
 545     if( !*++s )
 546       break;
 547
 548     if (*s == '\\') {
 549         r += '\\';
 550     } else if (*s == 'n') {
 551         r += '\n';
 552     } else if (*s == 'r') {
 553         r += '\r';
 554     } else if (*s == 't') {
 555         r += '\t';
 556     } else if (*s == 'v') {
 557         r += '\v';
 558     } else if (*s == 'f') {
 559         r += '\f';
 560     } else if (*s == 'a') {
 561         r += '\a';
 562     } else if (*s == 'b') {
 563         r += '\b';
 564     } else if (*s == 'x') {
 565         if (!*++s)
 566             break;
 567         int v = 0;
 568         for (int i = 0; i < 2 && isxdigit(*s); i++, s++)
 569             v = v * 16 + (isdigit(*s) ? *s - '0' : 10 + tolower(*s) - 'a');
 570         r += v;
 571         continue;
 572
 573     } else if (*s >= '0' && *s <= '7') {
 574         int v = *s++ - '0';
 575         for (int i = 0; i < 3 && *s >= '0' && *s <= '7'; i++, s++)
 576             v = v * 8 + *s - '0';
 577         r += v;
 578         continue;
 579
 580     } else {
 581         r += *s;
 582     }
 583     s++;
 584   }
 585   return r;
 586 }
 587
 588 string sanitizePrintfFormat(const string& input)
 589 {
 590     string::size_type i = input.find("%n");
 591     if (i != string::npos) {
 592         SG_LOG(SG_IO, SG_WARN, "sanitizePrintfFormat: bad format string:" << input);
 593         return string();
 594     }
 595
 596     return input;
 597 }
 598
 599 } // end namespace strutils
 600
 601 } // end namespace simgear