simgear/misc/strutils.cxx

   1 // String utilities.
   2 //
   3 // Written by Bernie Bright, started 1998
   4 //
   5 // Copyright (C) 1998  Bernie Bright - bbright@bigpond.net.au
   6 //
   7 // This library is free software; you can redistribute it and/or
   8 // modify it under the terms of the GNU Library General Public
   9 // License as published by the Free Software Foundation; either
  10 // version 2 of the License, or (at your option) any later version.
  11 //
  12 // This library is distributed in the hope that it will be useful,
  13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 // Library General Public License for more details.
  16 //
  17 // You should have received a copy of the GNU General Public License
  18 // along with this program; if not, write to the Free Software
  19 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  20 //
  21 // $Id$
  22
  23 #include <ctype.h>
  24 #include <cstring>
  25 #include <sstream>
  26 #include <algorithm>
  27
  28 #include "strutils.hxx"
  29
  30 #include <simgear/debug/logstream.hxx>
  31 #include <simgear/package/md5.h>
  32
  33 using std::string;
  34 using std::vector;
  35 using std::stringstream;
  36
  37 namespace simgear {
  38     namespace strutils {
  39
        /*
         * utf8ToLatin1() converts UTF-8 text to Latin-1; useful for accented characters (e.g. éâàîè...)
         */
  43         template <typename Iterator> size_t get_length (Iterator p) {
  44                 unsigned char c = static_cast<unsigned char> (*p);
  45                 if (c < 0x80) return 1;
  46                 else if (!(c & 0x20)) return 2;
  47                 else if (!(c & 0x10)) return 3;
  48                 else if (!(c & 0x08)) return 4;
  49                 else if (!(c & 0x04)) return 5;
  50                 else return 6;
  51         }
  52
  53         typedef unsigned int value_type;
  54         template <typename Iterator> value_type get_value (Iterator p) {
  55                 size_t len = get_length (p);
  56                 if (len == 1) return *p;
  57                 value_type res = static_cast<unsigned char> ( *p & (0xff >> (len + 1))) << ((len - 1) * 6 );
  58                 for (--len; len; --len) {
  59                         value_type next_byte = static_cast<unsigned char> (*(++p)) - 0x80;
  60                         if (next_byte & 0xC0) return 0x00ffffff; // invalid UTF-8
  61                         res |= next_byte << ((len - 1) * 6);
  62                         }
  63                 return res;
  64         }
  65
  66         string utf8ToLatin1( string& s_utf8 ) {
  67                 string s_latin1;
  68                 for (string::iterator p = s_utf8.begin(); p != s_utf8.end(); ++p) {
  69                         value_type value = get_value<string::iterator&>(p);
  70                         if (value > 0x10ffff) return s_utf8; // invalid UTF-8: guess that the input was already Latin-1
  71                         if (value > 0xff) SG_LOG(SG_IO, SG_WARN, "utf8ToLatin1: wrong char value: " << value);
  72                         s_latin1 += static_cast<char>(value);
  73                 }
  74                 return s_latin1;
  75         }
  76
  77         /**
  78          *
  79          */
  80         static vector<string>
  81         split_whitespace( const string& str, int maxsplit )
  82         {
  83             vector<string> result;
  84             string::size_type len = str.length();
  85             string::size_type i = 0;
  86             string::size_type j;
  87             int countsplit = 0;
  88
  89             while (i < len)
  90             {
  91                 while (i < len && isspace((unsigned char)str[i]))
  92                 {
  93                     ++i;
  94                 }
  95
  96                 j = i;
  97
  98                 while (i < len && !isspace((unsigned char)str[i]))
  99                 {
 100                     ++i;
 101                 }
 102
 103                 if (j < i)
 104                 {
 105                     result.push_back( str.substr(j, i-j) );
 106                     ++countsplit;
 107                     while (i < len && isspace((unsigned char)str[i]))
 108                     {
 109                         ++i;
 110                     }
 111
 112                     if (maxsplit && (countsplit >= maxsplit) && i < len)
 113                     {
 114                         result.push_back( str.substr( i, len-i ) );
 115                         i = len;
 116                     }
 117                 }
 118             }
 119
 120             return result;
 121         }
 122
 123         /**
 124          *
 125          */
 126         vector<string>
 127         split( const string& str, const char* sep, int maxsplit )
 128         {
 129             if (sep == 0)
 130                 return split_whitespace( str, maxsplit );
 131
 132             vector<string> result;
 133             int n = std::strlen( sep );
 134             if (n == 0)
 135             {
 136                 // Error: empty separator string
 137                 return result;
 138             }
 139             const char* s = str.c_str();
 140             string::size_type len = str.length();
 141             string::size_type i = 0;
 142             string::size_type j = 0;
 143             int splitcount = 0;
 144
 145             while (i+n <= len)
 146             {
 147                 if (s[i] == sep[0] && (n == 1 || std::memcmp(s+i, sep, n) == 0))
 148                 {
 149                     result.push_back( str.substr(j,i-j) );
 150                     i = j = i + n;
 151                     ++splitcount;
 152                     if (maxsplit && (splitcount >= maxsplit))
 153                         break;
 154                 }
 155                 else
 156                 {
 157                     ++i;
 158                 }
 159             }
 160
 161             result.push_back( str.substr(j,len-j) );
 162             return result;
 163         }
 164
 165         /**
 166          * The lstrip(), rstrip() and strip() functions are implemented
 167          * in do_strip() which uses an additional parameter to indicate what
 168          * type of strip should occur.
 169          */
 170         const int LEFTSTRIP = 0;
 171         const int RIGHTSTRIP = 1;
 172         const int BOTHSTRIP = 2;
 173
 174         static string
 175         do_strip( const string& s, int striptype )
 176         {
 177             string::size_type len = s.length();
 178             if( len == 0 ) // empty string is trivial
 179                 return s;
 180             string::size_type i = 0;
 181             if (striptype != RIGHTSTRIP)
 182             {
 183                 while (i < len && isspace(s[i]))
 184                 {
 185                     ++i;
 186                 }
 187             }
 188
 189             string::size_type j = len;
 190             if (striptype != LEFTSTRIP)
 191             {
 192                 do
 193                 {
 194                     --j;
 195                 }
 196                 while (j >= 1 && isspace(s[j]));
 197                 ++j;
 198             }
 199
 200             if (i == 0 && j == len)
 201             {
 202                 return s;
 203             }
 204             else
 205             {
 206                 return s.substr( i, j - i );
 207             }
 208         }
 209
 210         string
 211         lstrip( const string& s )
 212         {
 213             return do_strip( s, LEFTSTRIP );
 214         }
 215
 216         string
 217         rstrip( const string& s )
 218         {
 219             return do_strip( s, RIGHTSTRIP );
 220         }
 221
 222         string
 223         strip( const string& s )
 224         {
 225             return do_strip( s, BOTHSTRIP );
 226         }
 227
 228         string
 229         rpad( const string & s, string::size_type length, char c )
 230         {
 231             string::size_type l = s.length();
 232             if( l >= length ) return s;
 233             string reply = s;
 234             return reply.append( length-l, c );
 235         }
 236
 237         string
 238         lpad( const string & s, size_t length, char c )
 239         {
 240             string::size_type l = s.length();
 241             if( l >= length ) return s;
 242             string reply = s;
 243             return reply.insert( 0, length-l, c );
 244         }
 245
 246         bool
 247         starts_with( const string & s, const string & substr )
 248         {
 249           return s.compare(0, substr.length(), substr) == 0;
 250         }
 251
 252         bool
 253         ends_with( const string & s, const string & substr )
 254         {
 255           if( substr.length() > s.length() )
 256             return false;
 257           return s.compare( s.length() - substr.length(),
 258                             substr.length(),
 259                             substr ) == 0;
 260         }
 261
 262     string simplify(const string& s)
 263     {
 264         string result; // reserve size of 's'?
 265         string::const_iterator it = s.begin(),
 266             end = s.end();
 267
 268     // advance to first non-space char - simplifes logic in main loop,
 269     // since we can always prepend a single space when we see a
 270     // space -> non-space transition
 271         for (; (it != end) && isspace(*it); ++it) { /* nothing */ }
 272
 273         bool lastWasSpace = false;
 274         for (; it != end; ++it) {
 275             char c = *it;
 276             if (isspace(c)) {
 277                 lastWasSpace = true;
 278                 continue;
 279             }
 280
 281             if (lastWasSpace) {
 282                 result.push_back(' ');
 283             }
 284
 285             lastWasSpace = false;
 286             result.push_back(c);
 287         }
 288
 289         return result;
 290     }
 291
 292     int to_int(const std::string& s, int base)
 293     {
 294         stringstream ss(s);
 295         switch (base) {
 296         case 8:      ss >> std::oct; break;
 297         case 16:     ss >> std::hex; break;
 298         default: break;
 299         }
 300
 301         int result;
 302         ss >> result;
 303         return result;
 304     }
 305
 306     int compare_versions(const string& v1, const string& v2)
 307     {
 308         vector<string> v1parts(split(v1, "."));
 309         vector<string> v2parts(split(v2, "."));
 310
 311         int lastPart = std::min(v1parts.size(), v2parts.size());
 312         for (int part=0; part < lastPart; ++part) {
 313             int part1 = to_int(v1parts[part]);
 314             int part2 = to_int(v2parts[part]);
 315
 316             if (part1 != part2) {
 317                 return part1 - part2;
 318             }
 319         } // of parts iteration
 320
 321         // reached end - longer wins
 322         return v1parts.size() - v2parts.size();
 323     }
 324
 325     string join(const string_list& l, const string& joinWith)
 326     {
 327         string result;
 328         unsigned int count = l.size();
 329         for (unsigned int i=0; i < count; ++i) {
 330             result += l[i];
 331             if (i < (count - 1)) {
 332                 result += joinWith;
 333             }
 334         }
 335
 336         return result;
 337     }
 338
 339     string uppercase(const string &s) {
 340       string rslt(s);
 341       for(string::iterator p = rslt.begin(); p != rslt.end(); p++){
 342         *p = toupper(*p);
 343       }
 344       return rslt;
 345     }
 346
 347     string lowercase(const string &s) {
 348       string rslt(s);
 349       for(string::iterator p = rslt.begin(); p != rslt.end(); p++){
 350         *p = tolower(*p);
 351       }
 352       return rslt;
 353     }
 354
 355     void lowercase(string &s) {
 356       for(string::iterator p = s.begin(); p != s.end(); p++){
 357         *p = tolower(*p);
 358       }
 359     }
 360
 361 #if defined(SG_WINDOWS)
 362
 363 #include <windows.h>
 364
 365 static WCharVec convertMultiByteToWString(DWORD encoding, const std::string& a)
 366 {
 367     WCharVec result;
 368     DWORD flags = 0;
 369     int requiredWideChars = MultiByteToWideChar(encoding, flags,
 370                         a.c_str(), a.size(),
 371                         NULL, 0);
 372     result.resize(requiredWideChars);
 373     MultiByteToWideChar(encoding, flags, a.c_str(), a.size(),
 374                         result.data(), result.size());
 375     return result;
 376 }
 377
 378 WCharVec convertUtf8ToWString(const std::string& a)
 379 {
 380     return convertMultiByteToWString(CP_UTF8, a);
 381 }
 382
 383 #endif
 384
 385 std::string convertWindowsLocal8BitToUtf8(const std::string& a)
 386 {
 387 #ifdef SG_WINDOWS
 388     DWORD flags = 0;
 389     WCharVec wideString = convertMultiByteToWString(CP_ACP, a);
 390
 391     // convert down to UTF-8
 392     std::vector<char> result;
 393     int requiredUTF8Chars = WideCharToMultiByte(CP_UTF8, flags,
 394                                                 wideString.data(), wideString.size(),
 395                                                 NULL, 0, NULL, NULL);
 396     result.resize(requiredUTF8Chars);
 397     WideCharToMultiByte(CP_UTF8, flags,
 398                         wideString.data(), wideString.size(),
 399                         result.data(), result.size(), NULL, NULL);
 400     return std::string(result.data(), result.size());
 401 #else
 402     return a;
 403 #endif
 404 }
 405
 406 //------------------------------------------------------------------------------
 407 std::string md5(const unsigned char* data, size_t num)
 408 {
 409   SG_MD5_CTX md5_ctx;
 410   SG_MD5Init(&md5_ctx);
 411   SG_MD5Update(&md5_ctx, data, num);
 412
 413   unsigned char digest[MD5_DIGEST_LENGTH];
 414   SG_MD5Final(digest, &md5_ctx);
 415
 416   return encodeHex(digest, MD5_DIGEST_LENGTH);
 417 }
 418
 419 //------------------------------------------------------------------------------
 420 std::string md5(const char* data, size_t num)
 421 {
 422   return md5(reinterpret_cast<const unsigned char*>(data), num);
 423 }
 424
 425 //------------------------------------------------------------------------------
 426 std::string md5(const std::string& str)
 427 {
 428   return md5(reinterpret_cast<const unsigned char*>(str.c_str()), str.size());
 429 }
 430
 431 //------------------------------------------------------------------------------
 432 static const std::string base64_chars =
 433 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 434 "abcdefghijklmnopqrstuvwxyz"
 435 "0123456789+/";
 436
 437 static const unsigned char base64_decode_map[128] =
 438 {
 439     127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
 440     127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
 441     127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
 442     127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
 443     127, 127, 127,  62, 127, 127, 127,  63,  52,  53,
 444     54,  55,  56,  57,  58,  59,  60,  61, 127, 127,
 445     127,  64, 127, 127, 127,   0,   1,   2,   3,   4,
 446     5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
 447     15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
 448     25, 127, 127, 127, 127, 127, 127,  26,  27,  28,
 449     29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
 450     39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
 451     49,  50,  51, 127, 127, 127, 127, 127
 452 };
 453
 454
 455 static inline bool is_base64(unsigned char c) {
 456   return (isalnum(c) || (c == '+') || (c == '/'));
 457 }
 458
 459 static bool is_whitespace(unsigned char c) {
 460     return ((c == ' ') || (c == '\r') || (c == '\n'));
 461 }
 462
 463 void decodeBase64(const std::string& encoded_string, std::vector<unsigned char>& ret)
 464 {
 465   int in_len = encoded_string.size();
 466   int i = 0;
 467   int j = 0;
 468   int in_ = 0;
 469   unsigned char char_array_4[4], char_array_3[3];
 470
 471   while (in_len-- && ( encoded_string[in_] != '=')) {
 472     if (is_whitespace( encoded_string[in_])) {
 473         in_++;
 474         continue;
 475     }
 476
 477     if (!is_base64(encoded_string[in_])) {
 478         break;
 479     }
 480
 481     char_array_4[i++] = encoded_string[in_]; in_++;
 482     if (i ==4) {
 483       for (i = 0; i <4; i++)
 484         char_array_4[i] = base64_decode_map[char_array_4[i]];
 485
 486       char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
 487       char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
 488       char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
 489
 490       for (i = 0; (i < 3); i++)
 491         ret.push_back(char_array_3[i]);
 492       i = 0;
 493     }
 494   }
 495
 496   if (i) {
 497     for (j = i; j <4; j++)
 498       char_array_4[j] = 0;
 499
 500     for (j = 0; j <4; j++)
 501       char_array_4[j] = base64_decode_map[char_array_4[j]];
 502
 503     char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
 504     char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
 505     char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
 506
 507     for (j = 0; (j < i - 1); j++) ret.push_back(char_array_3[j]);
 508   }
 509 }
 510
 511 //------------------------------------------------------------------------------
 512 const char hexChar[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
 513
 514 std::string encodeHex(const std::string& bytes)
 515 {
 516   return encodeHex(
 517     reinterpret_cast<const unsigned char*>(bytes.c_str()),
 518     bytes.size()
 519   );
 520 }
 521
 522 std::string encodeHex(const unsigned char* rawBytes, unsigned int length)
 523 {
 524   std::string hex(length * 2, '\0');
 525   for (unsigned int i=0; i<length;++i) {
 526       unsigned char c = *rawBytes++;
 527       hex[i * 2] = hexChar[c >> 4];
 528       hex[i * 2 + 1] = hexChar[c & 0x0f];
 529   }
 530
 531   return hex;
 532 }
 533
 534 //------------------------------------------------------------------------------
 535 std::string unescape(const char* s)
 536 {
 537   std::string r;
 538   while( *s )
 539   {
 540     if( *s != '\\' )
 541     {
 542       r += *s++;
 543       continue;
 544     }
 545
 546     if( !*++s )
 547       break;
 548
 549     if (*s == '\\') {
 550         r += '\\';
 551     } else if (*s == 'n') {
 552         r += '\n';
 553     } else if (*s == 'r') {
 554         r += '\r';
 555     } else if (*s == 't') {
 556         r += '\t';
 557     } else if (*s == 'v') {
 558         r += '\v';
 559     } else if (*s == 'f') {
 560         r += '\f';
 561     } else if (*s == 'a') {
 562         r += '\a';
 563     } else if (*s == 'b') {
 564         r += '\b';
 565     } else if (*s == 'x') {
 566         if (!*++s)
 567             break;
 568         int v = 0;
 569         for (int i = 0; i < 2 && isxdigit(*s); i++, s++)
 570             v = v * 16 + (isdigit(*s) ? *s - '0' : 10 + tolower(*s) - 'a');
 571         r += v;
 572         continue;
 573
 574     } else if (*s >= '0' && *s <= '7') {
 575         int v = *s++ - '0';
 576         for (int i = 0; i < 3 && *s >= '0' && *s <= '7'; i++, s++)
 577             v = v * 8 + *s - '0';
 578         r += v;
 579         continue;
 580
 581     } else {
 582         r += *s;
 583     }
 584     s++;
 585   }
 586   return r;
 587 }
 588
 589 string sanitizePrintfFormat(const string& input)
 590 {
 591     string::size_type i = input.find("%n");
 592     if (i != string::npos) {
 593         SG_LOG(SG_IO, SG_WARN, "sanitizePrintfFormat: bad format string:" << input);
 594         return string();
 595     }
 596
 597     return input;
 598 }
 599
 600 } // end namespace strutils
 601
 602 } // end namespace simgear