simgear/misc/strutils.cxx

   1 // String utilities.
   2 //
   3 // Written by Bernie Bright, started 1998
   4 //
   5 // Copyright (C) 1998  Bernie Bright - bbright@bigpond.net.au
   6 //
   7 // This library is free software; you can redistribute it and/or
   8 // modify it under the terms of the GNU Library General Public
   9 // License as published by the Free Software Foundation; either
  10 // version 2 of the License, or (at your option) any later version.
  11 //
  12 // This library is distributed in the hope that it will be useful,
  13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 // Library General Public License for more details.
  16 //
  17 // You should have received a copy of the GNU General Public License
  18 // along with this program; if not, write to the Free Software
  19 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20 //
  21 // $Id$
  22
  23 #include <ctype.h>
  24 #include <cstring>
  25 #include <sstream>
  26 #include <algorithm>
  27 #include <string.h>             // strerror_r() and strerror_s()
  28 #include <errno.h>
  29
  30 #include "strutils.hxx"
  31
  32 #include <simgear/debug/logstream.hxx>
  33 #include <simgear/package/md5.h>
  34 #include <simgear/compiler.h>   // SG_WINDOWS
  35
  36 using std::string;
  37 using std::vector;
  38 using std::stringstream;
  39
  40 namespace simgear {
  41     namespace strutils {
  42
        /*
         * utf8ToLatin1() converts UTF-8 to Latin-1; useful for accented characters (e.g. éâàîè...)
         */
  46         template <typename Iterator> size_t get_length (Iterator p) {
  47                 unsigned char c = static_cast<unsigned char> (*p);
  48                 if (c < 0x80) return 1;
  49                 else if (!(c & 0x20)) return 2;
  50                 else if (!(c & 0x10)) return 3;
  51                 else if (!(c & 0x08)) return 4;
  52                 else if (!(c & 0x04)) return 5;
  53                 else return 6;
  54         }
  55
  56         typedef unsigned int value_type;
  57         template <typename Iterator> value_type get_value (Iterator p) {
  58                 size_t len = get_length (p);
  59                 if (len == 1) return *p;
  60                 value_type res = static_cast<unsigned char> ( *p & (0xff >> (len + 1))) << ((len - 1) * 6 );
  61                 for (--len; len; --len) {
  62                         value_type next_byte = static_cast<unsigned char> (*(++p)) - 0x80;
  63                         if (next_byte & 0xC0) return 0x00ffffff; // invalid UTF-8
  64                         res |= next_byte << ((len - 1) * 6);
  65                         }
  66                 return res;
  67         }
  68
  69         string utf8ToLatin1( string& s_utf8 ) {
  70                 string s_latin1;
  71                 for (string::iterator p = s_utf8.begin(); p != s_utf8.end(); ++p) {
  72                         value_type value = get_value<string::iterator&>(p);
  73                         if (value > 0x10ffff) return s_utf8; // invalid UTF-8: guess that the input was already Latin-1
  74                         if (value > 0xff) SG_LOG(SG_IO, SG_WARN, "utf8ToLatin1: wrong char value: " << value);
  75                         s_latin1 += static_cast<char>(value);
  76                 }
  77                 return s_latin1;
  78         }
  79
  80         /**
  81          *
  82          */
  83         static vector<string>
  84         split_whitespace( const string& str, int maxsplit )
  85         {
  86             vector<string> result;
  87             string::size_type len = str.length();
  88             string::size_type i = 0;
  89             string::size_type j;
  90             int countsplit = 0;
  91
  92             while (i < len)
  93             {
  94                 while (i < len && isspace((unsigned char)str[i]))
  95                 {
  96                     ++i;
  97                 }
  98
  99                 j = i;
 100
 101                 while (i < len && !isspace((unsigned char)str[i]))
 102                 {
 103                     ++i;
 104                 }
 105
 106                 if (j < i)
 107                 {
 108                     result.push_back( str.substr(j, i-j) );
 109                     ++countsplit;
 110                     while (i < len && isspace((unsigned char)str[i]))
 111                     {
 112                         ++i;
 113                     }
 114
 115                     if (maxsplit && (countsplit >= maxsplit) && i < len)
 116                     {
 117                         result.push_back( str.substr( i, len-i ) );
 118                         i = len;
 119                     }
 120                 }
 121             }
 122
 123             return result;
 124         }
 125
 126         /**
 127          *
 128          */
 129         vector<string>
 130         split( const string& str, const char* sep, int maxsplit )
 131         {
 132             if (sep == 0)
 133                 return split_whitespace( str, maxsplit );
 134
 135             vector<string> result;
 136             int n = std::strlen( sep );
 137             if (n == 0)
 138             {
 139                 // Error: empty separator string
 140                 return result;
 141             }
 142             const char* s = str.c_str();
 143             string::size_type len = str.length();
 144             string::size_type i = 0;
 145             string::size_type j = 0;
 146             int splitcount = 0;
 147
 148             while (i+n <= len)
 149             {
 150                 if (s[i] == sep[0] && (n == 1 || std::memcmp(s+i, sep, n) == 0))
 151                 {
 152                     result.push_back( str.substr(j,i-j) );
 153                     i = j = i + n;
 154                     ++splitcount;
 155                     if (maxsplit && (splitcount >= maxsplit))
 156                         break;
 157                 }
 158                 else
 159                 {
 160                     ++i;
 161                 }
 162             }
 163
 164             result.push_back( str.substr(j,len-j) );
 165             return result;
 166         }
 167
 168         /**
 169          * The lstrip(), rstrip() and strip() functions are implemented
 170          * in do_strip() which uses an additional parameter to indicate what
 171          * type of strip should occur.
 172          */
 173         const int LEFTSTRIP = 0;
 174         const int RIGHTSTRIP = 1;
 175         const int BOTHSTRIP = 2;
 176
 177         static string
 178         do_strip( const string& s, int striptype )
 179         {
 180             string::size_type len = s.length();
 181             if( len == 0 ) // empty string is trivial
 182                 return s;
 183             string::size_type i = 0;
 184             if (striptype != RIGHTSTRIP)
 185             {
 186                 while (i < len && isspace(s[i]))
 187                 {
 188                     ++i;
 189                 }
 190             }
 191
 192             string::size_type j = len;
 193             if (striptype != LEFTSTRIP)
 194             {
 195                 do
 196                 {
 197                     --j;
 198                 }
 199                 while (j >= 1 && isspace(s[j]));
 200                 ++j;
 201             }
 202
 203             if (i == 0 && j == len)
 204             {
 205                 return s;
 206             }
 207             else
 208             {
 209                 return s.substr( i, j - i );
 210             }
 211         }
 212
 213         string
 214         lstrip( const string& s )
 215         {
 216             return do_strip( s, LEFTSTRIP );
 217         }
 218
 219         string
 220         rstrip( const string& s )
 221         {
 222             return do_strip( s, RIGHTSTRIP );
 223         }
 224
 225         string
 226         strip( const string& s )
 227         {
 228             return do_strip( s, BOTHSTRIP );
 229         }
 230
 231         string
 232         rpad( const string & s, string::size_type length, char c )
 233         {
 234             string::size_type l = s.length();
 235             if( l >= length ) return s;
 236             string reply = s;
 237             return reply.append( length-l, c );
 238         }
 239
 240         string
 241         lpad( const string & s, size_t length, char c )
 242         {
 243             string::size_type l = s.length();
 244             if( l >= length ) return s;
 245             string reply = s;
 246             return reply.insert( 0, length-l, c );
 247         }
 248
 249         bool
 250         starts_with( const string & s, const string & substr )
 251         {
 252           return s.compare(0, substr.length(), substr) == 0;
 253         }
 254
 255         bool
 256         ends_with( const string & s, const string & substr )
 257         {
 258           if( substr.length() > s.length() )
 259             return false;
 260           return s.compare( s.length() - substr.length(),
 261                             substr.length(),
 262                             substr ) == 0;
 263         }
 264
 265     string simplify(const string& s)
 266     {
 267         string result; // reserve size of 's'?
 268         string::const_iterator it = s.begin(),
 269             end = s.end();
 270
 271     // advance to first non-space char - simplifes logic in main loop,
 272     // since we can always prepend a single space when we see a
 273     // space -> non-space transition
 274         for (; (it != end) && isspace(*it); ++it) { /* nothing */ }
 275
 276         bool lastWasSpace = false;
 277         for (; it != end; ++it) {
 278             char c = *it;
 279             if (isspace(c)) {
 280                 lastWasSpace = true;
 281                 continue;
 282             }
 283
 284             if (lastWasSpace) {
 285                 result.push_back(' ');
 286             }
 287
 288             lastWasSpace = false;
 289             result.push_back(c);
 290         }
 291
 292         return result;
 293     }
 294
 295     int to_int(const std::string& s, int base)
 296     {
 297         stringstream ss(s);
 298         switch (base) {
 299         case 8:      ss >> std::oct; break;
 300         case 16:     ss >> std::hex; break;
 301         default: break;
 302         }
 303
 304         int result;
 305         ss >> result;
 306         return result;
 307     }
 308
 309     int compare_versions(const string& v1, const string& v2)
 310     {
 311         vector<string> v1parts(split(v1, "."));
 312         vector<string> v2parts(split(v2, "."));
 313
 314         int lastPart = std::min(v1parts.size(), v2parts.size());
 315         for (int part=0; part < lastPart; ++part) {
 316             int part1 = to_int(v1parts[part]);
 317             int part2 = to_int(v2parts[part]);
 318
 319             if (part1 != part2) {
 320                 return part1 - part2;
 321             }
 322         } // of parts iteration
 323
 324         // reached end - longer wins
 325         return v1parts.size() - v2parts.size();
 326     }
 327
 328     string join(const string_list& l, const string& joinWith)
 329     {
 330         string result;
 331         unsigned int count = l.size();
 332         for (unsigned int i=0; i < count; ++i) {
 333             result += l[i];
 334             if (i < (count - 1)) {
 335                 result += joinWith;
 336             }
 337         }
 338
 339         return result;
 340     }
 341
 342     string uppercase(const string &s) {
 343       string rslt(s);
 344       for(string::iterator p = rslt.begin(); p != rslt.end(); p++){
 345         *p = toupper(*p);
 346       }
 347       return rslt;
 348     }
 349
 350     string lowercase(const string &s) {
 351       string rslt(s);
 352       for(string::iterator p = rslt.begin(); p != rslt.end(); p++){
 353         *p = tolower(*p);
 354       }
 355       return rslt;
 356     }
 357
 358     void lowercase(string &s) {
 359       for(string::iterator p = s.begin(); p != s.end(); p++){
 360         *p = tolower(*p);
 361       }
 362     }
 363
 364 #if defined(SG_WINDOWS)
 365
 366 #include <windows.h>
 367
 368 static WCharVec convertMultiByteToWString(DWORD encoding, const std::string& a)
 369 {
 370     WCharVec result;
 371     DWORD flags = 0;
 372     int requiredWideChars = MultiByteToWideChar(encoding, flags,
 373                         a.c_str(), a.size(),
 374                         NULL, 0);
 375     result.resize(requiredWideChars);
 376     MultiByteToWideChar(encoding, flags, a.c_str(), a.size(),
 377                         result.data(), result.size());
 378     return result;
 379 }
 380
 381 WCharVec convertUtf8ToWString(const std::string& a)
 382 {
 383     return convertMultiByteToWString(CP_UTF8, a);
 384 }
 385
 386 #endif
 387
 388 std::string convertWindowsLocal8BitToUtf8(const std::string& a)
 389 {
 390 #ifdef SG_WINDOWS
 391     DWORD flags = 0;
 392     WCharVec wideString = convertMultiByteToWString(CP_ACP, a);
 393
 394     // convert down to UTF-8
 395     std::vector<char> result;
 396     int requiredUTF8Chars = WideCharToMultiByte(CP_UTF8, flags,
 397                                                 wideString.data(), wideString.size(),
 398                                                 NULL, 0, NULL, NULL);
 399     result.resize(requiredUTF8Chars);
 400     WideCharToMultiByte(CP_UTF8, flags,
 401                         wideString.data(), wideString.size(),
 402                         result.data(), result.size(), NULL, NULL);
 403     return std::string(result.data(), result.size());
 404 #else
 405     return a;
 406 #endif
 407 }
 408
 409 //------------------------------------------------------------------------------
 410 std::string md5(const unsigned char* data, size_t num)
 411 {
 412   SG_MD5_CTX md5_ctx;
 413   SG_MD5Init(&md5_ctx);
 414   SG_MD5Update(&md5_ctx, data, num);
 415
 416   unsigned char digest[MD5_DIGEST_LENGTH];
 417   SG_MD5Final(digest, &md5_ctx);
 418
 419   return encodeHex(digest, MD5_DIGEST_LENGTH);
 420 }
 421
 422 //------------------------------------------------------------------------------
 423 std::string md5(const char* data, size_t num)
 424 {
 425   return md5(reinterpret_cast<const unsigned char*>(data), num);
 426 }
 427
 428 //------------------------------------------------------------------------------
 429 std::string md5(const std::string& str)
 430 {
 431   return md5(reinterpret_cast<const unsigned char*>(str.c_str()), str.size());
 432 }
 433
 434 //------------------------------------------------------------------------------
// The 64 characters of the base64 alphabet, ordered by the 6-bit value each
// one represents ('A' = 0 ... '/' = 63).
// NOTE(review): not referenced by any other code visible in this file.
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
 439
// Lookup table from a 7-bit character code to its base64 value, ten entries
// per row. 'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to
// 62 and '/' to 63. The padding character '=' maps to 64 and every other
// code to 127. The table has 128 entries only, so it must not be indexed
// with values above 127.
static const unsigned char base64_decode_map[128] =
{
    127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
    127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
    127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
    127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
    127, 127, 127,  62, 127, 127, 127,  63,  52,  53,
    54,  55,  56,  57,  58,  59,  60,  61, 127, 127,
    127,  64, 127, 127, 127,   0,   1,   2,   3,   4,
    5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
    15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
    25, 127, 127, 127, 127, 127, 127,  26,  27,  28,
    29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
    39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
    49,  50,  51, 127, 127, 127, 127, 127
};
 456
 457
 458 static inline bool is_base64(unsigned char c) {
 459   return (isalnum(c) || (c == '+') || (c == '/'));
 460 }
 461
 462 static bool is_whitespace(unsigned char c) {
 463     return ((c == ' ') || (c == '\r') || (c == '\n'));
 464 }
 465
 466 void decodeBase64(const std::string& encoded_string, std::vector<unsigned char>& ret)
 467 {
 468   int in_len = encoded_string.size();
 469   int i = 0;
 470   int j = 0;
 471   int in_ = 0;
 472   unsigned char char_array_4[4], char_array_3[3];
 473
 474   while (in_len-- && ( encoded_string[in_] != '=')) {
 475     if (is_whitespace( encoded_string[in_])) {
 476         in_++;
 477         continue;
 478     }
 479
 480     if (!is_base64(encoded_string[in_])) {
 481         break;
 482     }
 483
 484     char_array_4[i++] = encoded_string[in_]; in_++;
 485     if (i ==4) {
 486       for (i = 0; i <4; i++)
 487         char_array_4[i] = base64_decode_map[char_array_4[i]];
 488
 489       char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
 490       char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
 491       char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
 492
 493       for (i = 0; (i < 3); i++)
 494         ret.push_back(char_array_3[i]);
 495       i = 0;
 496     }
 497   }
 498
 499   if (i) {
 500     for (j = i; j <4; j++)
 501       char_array_4[j] = 0;
 502
 503     for (j = 0; j <4; j++)
 504       char_array_4[j] = base64_decode_map[char_array_4[j]];
 505
 506     char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
 507     char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
 508     char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
 509
 510     for (j = 0; (j < i - 1); j++) ret.push_back(char_array_3[j]);
 511   }
 512 }
 513
 514 //------------------------------------------------------------------------------
// Lower-case hexadecimal digits, indexed by nibble value (0-15). Not
// null-terminated: only ever use it as a 16-entry lookup table.
const char hexChar[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
 516
 517 std::string encodeHex(const std::string& bytes)
 518 {
 519   return encodeHex(
 520     reinterpret_cast<const unsigned char*>(bytes.c_str()),
 521     bytes.size()
 522   );
 523 }
 524
 525 std::string encodeHex(const unsigned char* rawBytes, unsigned int length)
 526 {
 527   std::string hex(length * 2, '\0');
 528   for (unsigned int i=0; i<length;++i) {
 529       unsigned char c = *rawBytes++;
 530       hex[i * 2] = hexChar[c >> 4];
 531       hex[i * 2 + 1] = hexChar[c & 0x0f];
 532   }
 533
 534   return hex;
 535 }
 536
 537 //------------------------------------------------------------------------------
 538 std::string unescape(const char* s)
 539 {
 540   std::string r;
 541   while( *s )
 542   {
 543     if( *s != '\\' )
 544     {
 545       r += *s++;
 546       continue;
 547     }
 548
 549     if( !*++s )
 550       break;
 551
 552     if (*s == '\\') {
 553         r += '\\';
 554     } else if (*s == 'n') {
 555         r += '\n';
 556     } else if (*s == 'r') {
 557         r += '\r';
 558     } else if (*s == 't') {
 559         r += '\t';
 560     } else if (*s == 'v') {
 561         r += '\v';
 562     } else if (*s == 'f') {
 563         r += '\f';
 564     } else if (*s == 'a') {
 565         r += '\a';
 566     } else if (*s == 'b') {
 567         r += '\b';
 568     } else if (*s == 'x') {
 569         if (!*++s)
 570             break;
 571         int v = 0;
 572         for (int i = 0; i < 2 && isxdigit(*s); i++, s++)
 573             v = v * 16 + (isdigit(*s) ? *s - '0' : 10 + tolower(*s) - 'a');
 574         r += v;
 575         continue;
 576
 577     } else if (*s >= '0' && *s <= '7') {
 578         int v = *s++ - '0';
 579         for (int i = 0; i < 3 && *s >= '0' && *s <= '7'; i++, s++)
 580             v = v * 8 + *s - '0';
 581         r += v;
 582         continue;
 583
 584     } else {
 585         r += *s;
 586     }
 587     s++;
 588   }
 589   return r;
 590 }
 591
 592 string sanitizePrintfFormat(const string& input)
 593 {
 594     string::size_type i = input.find("%n");
 595     if (i != string::npos) {
 596         SG_LOG(SG_IO, SG_WARN, "sanitizePrintfFormat: bad format string:" << input);
 597         return string();
 598     }
 599
 600     return input;
 601 }
 602
// Return the message text for the error number 'errnum' using a thread-safe
// replacement for strerror(). Which function is used is chosen at compile
// time:
//   - SG_WINDOWS:                 strerror_s()
//   - _GNU_SOURCE:                strerror_r(), whose return value is used
//                                 directly as the message string
//   - _POSIX_C_SOURCE >= 200112L: strerror_r(), whose return value is
//                                 treated as a status code
// Compilation fails if none of these is available.
//
// In the two status-code variants a non-zero status makes the function
// throw sg_error; the error number, formatted as text, is passed as the
// second constructor argument.
//
// NOTE(review): if SG_WINDOWS and _GNU_SOURCE are both defined, the first
// branch is taken but the code after the second #if is compiled out, so the
// function would end without returning a value - confirm that this
// combination cannot occur.
std::string error_string(int errnum)
{
  char buf[512];                // somewhat arbitrary...
  // This could be simplified with C11 (annex K, optional...), which offers:
  //
  //   errno_t strerror_s( char *buf, rsize_t bufsz, errno_t errnum );
  //   size_t strerrorlen_s( errno_t errnum );

#if defined(SG_WINDOWS)
  errno_t retcode;
  // Always makes the string in 'buf' null-terminated
  retcode = strerror_s(buf, sizeof(buf), errnum);
#elif defined(_GNU_SOURCE)
  // The message is taken from the returned pointer rather than from 'buf'.
  return std::string(strerror_r(errnum, buf, sizeof(buf)));
#elif _POSIX_C_SOURCE >= 200112L
  int retcode;
  // POSIX.1-2001 and POSIX.1-2008
  retcode = strerror_r(errnum, buf, sizeof(buf));
#else
#error "Could not find a thread-safe alternative to strerror()."
#endif

  // Only the variants that report a status code need the handling below; the
  // _GNU_SOURCE branch has already returned.
#if !defined(_GNU_SOURCE)
  if (retcode) {
    std::string msg = "unable to get error message for a given error number";
    // C++11 would make this shorter with std::to_string()
    std::ostringstream ostr;
    ostr << errnum;

#if !defined(SG_WINDOWS)
    if (retcode == ERANGE) {    // more specific error message in this case
      msg = std::string("buffer too small to hold the error message for "
                        "the specified error number");
    }
#endif

    throw sg_error(msg, ostr.str());
  }

  return std::string(buf);
#endif  // !defined(_GNU_SOURCE)
}
 645
 646 } // end namespace strutils
 647
 648 } // end namespace simgear