simgear/xml/xmltok.c

   1 /*
   2 The contents of this file are subject to the Mozilla Public License
   3 Version 1.1 (the "License"); you may not use this file except in
   4 compliance with the License. You may obtain a copy of the License at
   5 http://www.mozilla.org/MPL/
   6
   7 Software distributed under the License is distributed on an "AS IS"
   8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
   9 License for the specific language governing rights and limitations
  10 under the License.
  11
  12 The Original Code is expat.
  13
  14 The Initial Developer of the Original Code is James Clark.
  15 Portions created by James Clark are Copyright (C) 1998, 1999
  16 James Clark. All Rights Reserved.
  17
  18 Contributor(s):
  19
  20 Alternatively, the contents of this file may be used under the terms
  21 of the GNU General Public License (the "GPL"), in which case the
  22 provisions of the GPL are applicable instead of those above.  If you
  23 wish to allow use of your version of this file only under the terms of
  24 the GPL and not to allow others to use your version of this file under
  25 the MPL, indicate your decision by deleting the provisions above and
  26 replace them with the notice and other provisions required by the
  27 GPL. If you do not delete the provisions above, a recipient may use
  28 your version of this file under either the MPL or the GPL.
  29 */
  30
  31 #include "xmldef.h"
  32 #include "xmltok.h"
  33 #include "nametab.h"
  34
/* VTABLE1 expands to the leading initializers of an encoding object: the
   two tokenizer tables followed by the helper functions.  Every entry is
   named through PREFIX(), so the same list is reused for each family of
   functions generated from xmltok_impl.c. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the toUtf8/toUtf16 converters of the current PREFIX. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  49
  50 #define UCS2_GET_NAMING(pages, hi, lo) \
  51    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
  52
  53 /* A 2 byte UTF-8 representation splits the characters 11 bits
  54 between the bottom 5 and 6 bits of the bytes.
  55 We need 8 bits to index into pages, 3 bits to add to that index and
  56 5 bits to generate the mask. */
  57 #define UTF8_GET_NAMING2(pages, byte) \
  58     (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
  59                       + ((((byte)[0]) & 3) << 1) \
  60                       + ((((byte)[1]) >> 5) & 1)] \
  61          & (1 << (((byte)[1]) & 0x1F)))
  62
  63 /* A 3 byte UTF-8 representation splits the characters 16 bits
  64 between the bottom 4, 6 and 6 bits of the bytes.
  65 We need 8 bits to index into pages, 3 bits to add to that index and
  66 5 bits to generate the mask. */
  67 #define UTF8_GET_NAMING3(pages, byte) \
  68   (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
  69                              + ((((byte)[1]) >> 2) & 0xF)] \
  70                        << 3) \
  71                       + ((((byte)[1]) & 3) << 1) \
  72                       + ((((byte)[2]) >> 5) & 1)] \
  73          & (1 << (((byte)[2]) & 0x1F)))
  74
/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   with UTF8_GET_NAMING2/UTF8_GET_NAMING3 (p is reinterpreted as unsigned
   bytes); any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
  81
  82 #define UTF8_INVALID3(p) \
  83   ((*p) == 0xED \
  84   ? (((p)[1] & 0x20) != 0) \
  85   : ((*p) == 0xEF \
  86      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
  87      : 0))
  88
  89 #define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
  90
  91 static
  92 int isNever(const ENCODING *enc, const char *p)
  93 {
  94   return 0;
  95 }
  96
  97 static
  98 int utf8_isName2(const ENCODING *enc, const char *p)
  99 {
 100   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
 101 }
 102
 103 static
 104 int utf8_isName3(const ENCODING *enc, const char *p)
 105 {
 106   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
 107 }
 108
/* 4 byte sequences are never accepted as name characters. */
#define utf8_isName4 isNever
 110
 111 static
 112 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
 113 {
 114   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
 115 }
 116
 117 static
 118 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
 119 {
 120   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
 121 }
 122
/* 4 byte sequences are never accepted as name-start characters. */
#define utf8_isNmstrt4 isNever

/* No 2 byte sequence is reported as invalid by this check. */
#define utf8_isInvalid2 isNever
 126
 127 static
 128 int utf8_isInvalid3(const ENCODING *enc, const char *p)
 129 {
 130   return UTF8_INVALID3((const unsigned char *)p);
 131 }
 132
 133 static
 134 int utf8_isInvalid4(const ENCODING *enc, const char *p)
 135 {
 136   return UTF8_INVALID4((const unsigned char *)p);
 137 }
 138
/* An ENCODING extended with a per-byte classification table and the
   character-class predicates reached through the BYTE_TYPE, IS_NAME_CHAR,
   IS_NMSTRT_CHAR and IS_INVALID_CHAR macros.  enc is the first member, so
   code in this file casts an ENCODING pointer to this type. */
struct normal_encoding {
  ENCODING enc;
  /* byte type (BT_* value) for each possible byte value */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Indirect versions of the per-encoding macros, used so that a single
     copy of the tokenizer can serve several encodings. */
  int (*byteType)(const ENCODING *, const char *);
  int (*isNameMin)(const ENCODING *, const char *);
  int (*isNmstrtMin)(const ENCODING *, const char *);
  int (*byteToAscii)(const ENCODING *, const char *);
  int (*charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Predicates for 2, 3 and 4 byte sequences: name character, name-start
     character and invalid sequence. */
  int (*isName2)(const ENCODING *, const char *);
  int (*isName3)(const ENCODING *, const char *);
  int (*isName4)(const ENCODING *, const char *);
  int (*isNmstrt2)(const ENCODING *, const char *);
  int (*isNmstrt3)(const ENCODING *, const char *);
  int (*isNmstrt4)(const ENCODING *, const char *);
  int (*isInvalid2)(const ENCODING *, const char *);
  int (*isInvalid3)(const ENCODING *, const char *);
  int (*isInvalid4)(const ENCODING *, const char *);
};
 159
/* STANDARD_VTABLE(E) supplies the initializers for the XML_MIN_SIZE-only
   members of struct normal_encoding, using the functions prefixed with E.
   The expansion ends with a comma so that more initializers can follow.
   Without XML_MIN_SIZE those members do not exist and the macro expands
   to nothing. */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif
 174
/* NORMAL_VTABLE(E) supplies the initializers for the nine multi-byte
   predicates of struct normal_encoding, in declaration order, using the
   functions prefixed with E. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
 185
 186 static int checkCharRefNumber(int);
 187
 188 #include "xmltok_impl.h"
 189
#ifdef XML_MIN_SIZE
/* The single-byte (sb_) encodings use isNever for the two "Min" predicate
   slots named by STANDARD_VTABLE(sb_). */
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

/* MINBPC(enc): with XML_MIN_SIZE it is read from the encoding at run
   time; otherwise it is the constant 1 for the code generated below. */
#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
 201
 202 #define SB_BYTE_TYPE(enc, p) \
 203   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
 204
 205 #ifdef XML_MIN_SIZE
 206 static
 207 int sb_byteType(const ENCODING *enc, const char *p)
 208 {
 209   return SB_BYTE_TYPE(enc, p);
 210 }
 211 #define BYTE_TYPE(enc, p) \
 212  (((const struct normal_encoding *)(enc))->byteType(enc, p))
 213 #else
 214 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
 215 #endif
 216
 217 #ifdef XML_MIN_SIZE
 218 #define BYTE_TO_ASCII(enc, p) \
 219  (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
 220 static
 221 int sb_byteToAscii(const ENCODING *enc, const char *p)
 222 {
 223   return *p;
 224 }
 225 #else
 226 #define BYTE_TO_ASCII(enc, p) (*p)
 227 #endif
 228
/* Predicates for an n byte sequence at p (n is pasted into the member
   name, so it must be the literal 2, 3 or 4): each calls the matching
   function pointer stored in the encoding. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

/* Name tests for a character of MINBPC bytes: dispatched through the
   encoding with XML_MIN_SIZE, constant 0 otherwise. */
#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
 245
 246 #ifdef XML_MIN_SIZE
 247 #define CHAR_MATCHES(enc, p, c) \
 248  (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
 249 static
 250 int sb_charMatches(const ENCODING *enc, const char *p, int c)
 251 {
 252   return *p == c;
 253 }
 254 #else
 255 /* c is an ASCII character */
 256 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
 257 #endif
 258
/* Instantiate the tokenizer from xmltok_impl.c with every function name
   prefixed by normal_, using the macros defined above. */
#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl.c"

/* Drop the per-encoding macros so they can be redefined for the next
   instantiation.  PREFIX itself stays defined. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
 271
/* Marker bits OR-ed into the first byte when UTF-8 is generated in this
   file (see latin1_toUtf8 and XmlUtf8Encode). */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
 278
 279 static
 280 void utf8_toUtf8(const ENCODING *enc,
 281                  const char **fromP, const char *fromLim,
 282                  char **toP, const char *toLim)
 283 {
 284   char *to;
 285   const char *from;
 286   if (fromLim - *fromP > toLim - *toP) {
 287     /* Avoid copying partial characters. */
 288     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
 289       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
 290         break;
 291   }
 292   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
 293     *to = *from;
 294   *fromP = from;
 295   *toP = to;
 296 }
 297
 298 static
 299 void utf8_toUtf16(const ENCODING *enc,
 300                   const char **fromP, const char *fromLim,
 301                   unsigned short **toP, const unsigned short *toLim)
 302 {
 303   unsigned short *to = *toP;
 304   const char *from = *fromP;
 305   while (from != fromLim && to != toLim) {
 306     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
 307     case BT_LEAD2:
 308       *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
 309       from += 2;
 310       break;
 311     case BT_LEAD3:
 312       *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
 313       from += 3;
 314       break;
 315     case BT_LEAD4:
 316       {
 317         unsigned long n;
 318         if (to + 1 == toLim)
 319           break;
 320         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
 321         n -= 0x10000;
 322         to[0] = (unsigned short)((n >> 10) | 0xD800);
 323         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
 324         to += 2;
 325         from += 4;
 326       }
 327       break;
 328     default:
 329       *to++ = *from++;
 330       break;
 331     }
 332   }
 333   *fromP = from;
 334   *toP = to;
 335 }
 336
/* UTF-8 encoding objects.  Each one combines the normal_ tokenizer
   functions (VTABLE1), the UTF-8 converters, a byte type table assembled
   from asciitab.h and utf8tab.h, and the utf8_ multi-byte predicates.
   The first number after the converters initializes minBytesPerChar
   (presumably; the ENCODING layout is declared in xmltok.h -- the meaning
   of the remaining two flags should be confirmed there). */
#ifdef XML_NS
/* Variant built only with XML_NS; asciitab.h is included with BT_COLON
   left as defined elsewhere. */
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif

/* Variant in which BT_COLON is mapped to BT_NMSTRT while asciitab.h is
   included (presumably so that ':' is treated as an ordinary name-start
   character -- confirm against asciitab.h). */
static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
 358
/* "Internal" UTF-8 encoding objects: identical to the ones above except
   that the ASCII part of the byte type table comes from iasciitab.h
   instead of asciitab.h. */
#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while iasciitab.h is
   included. */
static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
 382
 383 static
 384 void latin1_toUtf8(const ENCODING *enc,
 385                    const char **fromP, const char *fromLim,
 386                    char **toP, const char *toLim)
 387 {
 388   for (;;) {
 389     unsigned char c;
 390     if (*fromP == fromLim)
 391       break;
 392     c = (unsigned char)**fromP;
 393     if (c & 0x80) {
 394       if (toLim - *toP < 2)
 395         break;
 396       *(*toP)++ = ((c >> 6) | UTF8_cval2);
 397       *(*toP)++ = ((c & 0x3f) | 0x80);
 398       (*fromP)++;
 399     }
 400     else {
 401       if (*toP == toLim)
 402         break;
 403       *(*toP)++ = *(*fromP)++;
 404     }
 405   }
 406 }
 407
 408 static
 409 void latin1_toUtf16(const ENCODING *enc,
 410                     const char **fromP, const char *fromLim,
 411                     unsigned short **toP, const unsigned short *toLim)
 412 {
 413   while (*fromP != fromLim && *toP != toLim)
 414     *(*toP)++ = (unsigned char)*(*fromP)++;
 415 }
 416
/* Latin-1 encoding objects: normal_ tokenizer functions, the latin1_
   converters and a byte type table assembled from asciitab.h and
   latin1tab.h.  No NORMAL_VTABLE entries are given, so the multi-byte
   predicate pointers are left zero-initialized. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while asciitab.h is
   included. */
static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};
 440
 441 static
 442 void ascii_toUtf8(const ENCODING *enc,
 443                   const char **fromP, const char *fromLim,
 444                   char **toP, const char *toLim)
 445 {
 446   while (*fromP != fromLim && *toP != toLim)
 447     *(*toP)++ = *(*fromP)++;
 448 }
 449
/* ASCII encoding objects: normal_ tokenizer functions, ascii_toUtf8 and
   latin1_toUtf16 as converters, and a byte type table that only lists the
   entries from asciitab.h.  The remaining table entries are
   zero-initialized, which the comment inside the initializer equates with
   BT_NONXML. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while asciitab.h is
   included. */
static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};
 473
 474 static int unicode_byte_type(char hi, char lo)
 475 {
 476   switch ((unsigned char)hi) {
 477   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
 478     return BT_LEAD4;
 479   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
 480     return BT_TRAIL;
 481   case 0xFF:
 482     switch ((unsigned char)lo) {
 483     case 0xFF:
 484     case 0xFE:
 485       return BT_NONXML;
 486     }
 487     break;
 488   }
 489   return BT_NONASCII;
 490 }
 491
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   2-byte code units from [*fromP, fromLim) to UTF-8 in [*toP, toLim).
   The byte order comes from the GET_LO/GET_HI macros in effect where the
   macro is expanded.  Units below 0x80 give 1 byte, units below 0x800
   give 2 bytes, a high surrogate (0xD8..0xDB) is combined with the unit
   that follows it into 4 bytes, and everything else gives 3 bytes.
   Whenever the next character does not fit in the output, *fromP is set
   to that character and the function returns; otherwise *fromP ends at
   the position where the loop stopped.
   NOTE(review): the surrogate case reads the following unit without
   comparing against fromLim, and the loop ends only when from == fromLim
   exactly; the input is presumably always a whole number of complete
   characters -- confirm against the callers. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
 554
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   2-byte code units from [*fromP, fromLim) into native unsigned shorts in
   [*toP, toLim), assembling each unit with the GET_HI/GET_LO macros in
   effect where the macro is expanded.  When the input is longer than the
   output and the last input unit has a high byte of 0xD8..0xDF, that unit
   is dropped from this call.  Copying stops when either range is
   exhausted; *fromP and *toP are advanced as units are copied. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
 568
/* Little-endian byte access: the low byte of a unit is stored first.
   SET2 stores the 16-bit value ch at ptr; GET_LO/GET_HI read the two
   bytes as unsigned values. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte access: the high byte of a unit is stored first. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
 592
/* Per-character helpers for 2-byte little-endian input, where p[0] is the
   low byte and p[1] the high byte.
   LITTLE2_BYTE_TYPE: when the high byte is 0 the low byte is looked up in
   the encoding's type table, otherwise unicode_byte_type classifies the
   unit.
   LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES: nonzero when the unit equals the character c.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC: naming
   bitmap lookups for the unit.  Note that p and c are not parenthesized
   in some of these expansions, so only simple arguments should be
   passed. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
 603
/* With XML_MIN_SIZE no separate little2_ tokenizer is generated: the
   LITTLE2_* helpers are wrapped in functions that fill the
   STANDARD_VTABLE(little2_) slots, and VTABLE is redefined to pair the
   VTABLE1 entries (PREFIX is still normal_ here) with the little2_
   converters.  Without XML_MIN_SIZE, xmltok_impl.c is instantiated again
   with the little2_ prefix and the LITTLE2_* macros. */
#ifdef XML_MIN_SIZE

/* Function form of LITTLE2_BYTE_TYPE. */
static
int little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII. */
static
int little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}

/* Function form of LITTLE2_CHAR_MATCHES. */
static
int little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC. */
static
int little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC. */
static
int little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
/* Multi-unit name checks always fail; names are tested per 2-byte unit
   with the *_MINBPC macros. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
 666
/* Little-endian 2-byte encoding objects.  The ENCODING part is
   { VTABLE, 2, 0, flag } where flag is 1 only when XML_BYTE_ORDER == 12
   (presumably "input already is UTF-16 in native byte order" -- confirm
   against the ENCODING declaration in xmltok.h).  The byte type table is
   assembled from asciitab.h and latin1tab.h. */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while asciitab.h is
   included. */
static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};
 702
/* "Internal" little-endian encoding objects, compiled unless
   XML_BYTE_ORDER is 21.  They use iasciitab.h for the ASCII part of the
   byte type table and always set the last ENCODING flag to 1. */
#if XML_BYTE_ORDER != 21

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while iasciitab.h is
   included. */
static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif
 730
 731
/* Per-character helpers for 2-byte big-endian input, where p[0] is the
   high byte and p[1] the low byte.
   BIG2_BYTE_TYPE: when the high byte is 0 the low byte is looked up in
   the encoding's type table, otherwise unicode_byte_type classifies the
   unit.
   BIG2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES: nonzero when the unit equals the character c.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC: naming bitmap
   lookups for the unit.  Note that p and c are not parenthesized in some
   of these expansions, so only simple arguments should be passed. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
 742
/* With XML_MIN_SIZE no separate big2_ tokenizer is generated: the BIG2_*
   helpers are wrapped in functions that fill the STANDARD_VTABLE(big2_)
   slots, and VTABLE is redefined to pair the VTABLE1 entries with the
   big2_ converters.  Without XML_MIN_SIZE, xmltok_impl.c is instantiated
   again with the big2_ prefix and the BIG2_* macros. */
#ifdef XML_MIN_SIZE

/* Function form of BIG2_BYTE_TYPE. */
static
int big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

/* Function form of BIG2_BYTE_TO_ASCII. */
static
int big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

/* Function form of BIG2_CHAR_MATCHES. */
static
int big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

/* Function form of BIG2_IS_NAME_CHAR_MINBPC. */
static
int big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC. */
static
int big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
/* Multi-unit name checks always fail; names are tested per 2-byte unit
   with the *_MINBPC macros. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
 805
/* Big-endian 2-byte encoding objects.  The ENCODING part is
   { VTABLE, 2, 0, flag } where flag is 1 only when XML_BYTE_ORDER == 21
   (presumably "input already is UTF-16 in native byte order" -- confirm
   against the ENCODING declaration in xmltok.h).  The byte type table is
   assembled from asciitab.h and latin1tab.h. */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while asciitab.h is
   included. */
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};
 841
/* "Internal" big-endian encoding objects, compiled unless XML_BYTE_ORDER
   is 12.  They use iasciitab.h for the ASCII part of the byte type table
   and always set the last ENCODING flag to 1. */
#if XML_BYTE_ORDER != 12

#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while iasciitab.h is
   included. */
static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

/* All tokenizer instantiations are done; PREFIX is no longer needed. */
#undef PREFIX
 871
 872 static
 873 int streqci(const char *s1, const char *s2)
 874 {
 875   for (;;) {
 876     char c1 = *s1++;
 877     char c2 = *s2++;
 878     if ('a' <= c1 && c1 <= 'z')
 879       c1 += 'A' - 'a';
 880     if ('a' <= c2 && c2 <= 'z')
 881       c2 += 'A' - 'a';
 882     if (c1 != c2)
 883       return 0;
 884     if (!c1)
 885       break;
 886   }
 887   return 1;
 888 }
 889
 890 static
 891 void initUpdatePosition(const ENCODING *enc, const char *ptr,
 892                         const char *end, POSITION *pos)
 893 {
 894   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
 895 }
 896
 897 static
 898 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
 899 {
 900   char buf[1];
 901   char *p = buf;
 902   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
 903   if (p == buf)
 904     return -1;
 905   else
 906     return buf[0];
 907 }
 908
 909 static
 910 int isSpace(int c)
 911 {
 912   switch (c) {
 913   case 0x20:
 914   case 0xD:
 915   case 0xA:
 916   case 0x9:
 917     return 1;
 918   }
 919   return 0;
 920 }
 921
/* Parse one pseudo-attribute (name = "value") from [ptr, end).
   Return 1 if there's just optional white space
   or there's an S followed by name=val.
   - Only white space (or nothing) left: *namePtr is set to 0.
   - A pseudo-attribute was found: *namePtr points to the first character
     of the name, *valPtr to the first character after the opening quote
     and *nextTokPtr just past the closing quote.
   Returns 0 on a syntax error, with *nextTokPtr pointing to the offending
   position.  The value may only contain ASCII letters, digits, '.', '-'
   and '_'.  Characters are examined one at a time through toAscii, and
   ptr advances by enc->minBytesPerChar.
   Note that *nextTokPtr is not written on the paths that set *namePtr
   to 0. */
static
int parsePseudoAttribute(const ENCODING *enc,
                         const char *ptr,
                         const char *end,
                         const char **namePtr,
                         const char **valPtr,
                         const char **nextTokPtr)
{
  int c;
  char open;
  if (ptr == end) {
    *namePtr = 0;
    return 1;
  }
  /* anything that follows must be separated by white space */
  if (!isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  if (ptr == end) {
    *namePtr = 0;
    return 1;
  }
  *namePtr = ptr;
  /* scan the name up to '=', allowing white space before the '=' */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == '=')
      break;
    if (isSpace(c)) {
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != '=') {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* an empty name is an error */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* skip '=' and any white space before the opening quote */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != '"' && c != '\'') {
    *nextTokPtr = ptr;
    return 0;
  }
  /* remember which quote opened the value; the same one must close it */
  open = c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    /* any other character (including -1 at the end of input) must be one
       of the permitted value characters */
    if (!('a' <= c && c <= 'z')
        && !('A' <= c && c <= 'Z')
        && !('0' <= c && c <= '9')
        && c != '.'
        && c != '-'
        && c != '_') {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
1004
/* Parse the pseudo-attributes of an XML or text declaration located in
   [ptr, end).  The first 5 and the last 2 characters of the range are
   skipped before parsing (presumably the "<?xml" and "?>" delimiters --
   confirm against the callers).
   Accepted order: version, encoding, standalone.  version may be omitted
   only when isGeneralTextEntity is nonzero; in that case an encoding is
   required and standalone is rejected.
   Outputs, each written only when the corresponding pointer is non-null:
     *versionPtr   - start of the version value
     *encodingName - start of the encoding value
     *encoding     - result of encodingFinder(enc, value start, position
                     of the closing quote)
     *standalone   - 1 for "yes", 0 for "no"
   Returns 1 on success.  Returns 0 on error with *badPtr pointing to the
   offending position. */
static
int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                     const char *,
                                                     const char *),
                   int isGeneralTextEntity,
                   const ENCODING *enc,
                   const char *ptr,
                   const char *end,
                   const char **badPtr,
                   const char **versionPtr,
                   const char **encodingName,
                   const ENCODING **encoding,
                   int *standalone)
{
  const char *val = 0;
  const char *name = 0;
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* at least one pseudo-attribute is required */
  if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, "version")) {
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, "encoding")) {
    /* the encoding name must start with an ASCII letter */
    int c = toAscii(enc, val, end);
    if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so the value ends one
       character earlier */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, "yes")) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, "no")) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* only white space may follow the standalone pseudo-attribute */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
1090
1091 static
1092 int checkCharRefNumber(int result)
1093 {
1094   switch (result >> 8) {
1095   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1096   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1097     return -1;
1098   case 0:
1099     if (latin1_encoding.type[result] == BT_NONXML)
1100       return -1;
1101     break;
1102   case 0xFF:
1103     if (result == 0xFFFE || result == 0xFFFF)
1104       return -1;
1105     break;
1106   }
1107   return result;
1108 }
1109
1110 int XmlUtf8Encode(int c, char *buf)
1111 {
1112   enum {
1113     /* minN is minimum legal resulting value for N byte sequence */
1114     min2 = 0x80,
1115     min3 = 0x800,
1116     min4 = 0x10000
1117   };
1118
1119   if (c < 0)
1120     return 0;
1121   if (c < min2) {
1122     buf[0] = (c | UTF8_cval1);
1123     return 1;
1124   }
1125   if (c < min3) {
1126     buf[0] = ((c >> 6) | UTF8_cval2);
1127     buf[1] = ((c & 0x3f) | 0x80);
1128     return 2;
1129   }
1130   if (c < min4) {
1131     buf[0] = ((c >> 12) | UTF8_cval3);
1132     buf[1] = (((c >> 6) & 0x3f) | 0x80);
1133     buf[2] = ((c & 0x3f) | 0x80);
1134     return 3;
1135   }
1136   if (c < 0x110000) {
1137     buf[0] = ((c >> 18) | UTF8_cval4);
1138     buf[1] = (((c >> 12) & 0x3f) | 0x80);
1139     buf[2] = (((c >> 6) & 0x3f) | 0x80);
1140     buf[3] = ((c & 0x3f) | 0x80);
1141     return 4;
1142   }
1143   return 0;
1144 }
1145
/* Encode the code point charNum as UTF-16 into buf.
   Returns the number of 16-bit units written: 1 for values below 0x10000,
   2 (a high/low surrogate pair) for values below 0x110000, and 0 without
   touching buf for anything else. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum <= 0xFFFF) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }

  /* split the 20-bit offset above 0x10000 across the two surrogates */
  offset = charNum - 0x10000;
  buf[0] = (unsigned short)(0xD800 + (offset >> 10));
  buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
  return 2;
}
1162
/* Encoding object for an encoding described by a caller-supplied byte table
   (filled in by XmlInitUnknownEncoding).  The converter functions cast the
   same ENCODING pointer to both struct normal_encoding and
   struct unknown_encoding, so `normal` has to remain the first member. */
struct unknown_encoding {
  struct normal_encoding normal;
  /* decodes the character starting at p; called for bytes whose table entry
     marks them as the start of a multi-byte sequence */
  int (*convert)(void *userData, const char *p);
  /* handed back to convert on every call */
  void *userData;
  /* UTF-16 unit for each byte value; 0 means "call convert" */
  unsigned short utf16[256];
  /* for each byte value: [0] is the UTF-8 length (0 means "call convert"),
     followed by up to three UTF-8 bytes */
  char utf8[256][4];
};
1170
1171 int XmlSizeOfUnknownEncoding()
1172 {
1173   return sizeof(struct unknown_encoding);
1174 }
1175
1176 static
1177 int unknown_isName(const ENCODING *enc, const char *p)
1178 {
1179   int c = ((const struct unknown_encoding *)enc)
1180           ->convert(((const struct unknown_encoding *)enc)->userData, p);
1181   if (c & ~0xFFFF)
1182     return 0;
1183   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1184 }
1185
1186 static
1187 int unknown_isNmstrt(const ENCODING *enc, const char *p)
1188 {
1189   int c = ((const struct unknown_encoding *)enc)
1190           ->convert(((const struct unknown_encoding *)enc)->userData, p);
1191   if (c & ~0xFFFF)
1192     return 0;
1193   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1194 }
1195
1196 static
1197 int unknown_isInvalid(const ENCODING *enc, const char *p)
1198 {
1199   int c = ((const struct unknown_encoding *)enc)
1200            ->convert(((const struct unknown_encoding *)enc)->userData, p);
1201   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1202 }
1203
1204 static
1205 void unknown_toUtf8(const ENCODING *enc,
1206                     const char **fromP, const char *fromLim,
1207                     char **toP, const char *toLim)
1208 {
1209   char buf[XML_UTF8_ENCODE_MAX];
1210   for (;;) {
1211     const char *utf8;
1212     int n;
1213     if (*fromP == fromLim)
1214       break;
1215     utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1216     n = *utf8++;
1217     if (n == 0) {
1218       int c = ((const struct unknown_encoding *)enc)
1219               ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1220       n = XmlUtf8Encode(c, buf);
1221       if (n > toLim - *toP)
1222         break;
1223       utf8 = buf;
1224       *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1225                  - (BT_LEAD2 - 2);
1226     }
1227     else {
1228       if (n > toLim - *toP)
1229         break;
1230       (*fromP)++;
1231     }
1232     do {
1233       *(*toP)++ = *utf8++;
1234     } while (--n != 0);
1235   }
1236 }
1237
1238 static
1239 void unknown_toUtf16(const ENCODING *enc,
1240                      const char **fromP, const char *fromLim,
1241                      unsigned short **toP, const unsigned short *toLim)
1242 {
1243   while (*fromP != fromLim && *toP != toLim) {
1244     unsigned short c
1245       = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1246     if (c == 0) {
1247       c = (unsigned short)((const struct unknown_encoding *)enc)
1248            ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1249       *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1250                  - (BT_LEAD2 - 2);
1251     }
1252     else
1253       (*fromP)++;
1254     *(*toP)++ = c;
1255   }
1256 }
1257
1258 ENCODING *
1259 XmlInitUnknownEncoding(void *mem,
1260                        int *table,
1261                        int (*convert)(void *userData, const char *p),
1262                        void *userData)
1263 {
1264   int i;
1265   struct unknown_encoding *e = mem;
1266   for (i = 0; i < sizeof(struct normal_encoding); i++)
1267     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1268   for (i = 0; i < 128; i++)
1269     if (latin1_encoding.type[i] != BT_OTHER
1270         && latin1_encoding.type[i] != BT_NONXML
1271         && table[i] != i)
1272       return 0;
1273   for (i = 0; i < 256; i++) {
1274     int c = table[i];
1275     if (c == -1) {
1276       e->normal.type[i] = BT_MALFORM;
1277       /* This shouldn't really get used. */
1278       e->utf16[i] = 0xFFFF;
1279       e->utf8[i][0] = 1;
1280       e->utf8[i][1] = 0;
1281     }
1282     else if (c < 0) {
1283       if (c < -4)
1284         return 0;
1285       e->normal.type[i] = BT_LEAD2 - (c + 2);
1286       e->utf8[i][0] = 0;
1287       e->utf16[i] = 0;
1288     }
1289     else if (c < 0x80) {
1290       if (latin1_encoding.type[c] != BT_OTHER
1291           && latin1_encoding.type[c] != BT_NONXML
1292           && c != i)
1293         return 0;
1294       e->normal.type[i] = latin1_encoding.type[c];
1295       e->utf8[i][0] = 1;
1296       e->utf8[i][1] = (char)c;
1297       e->utf16[i] = c == 0 ? 0xFFFF : c;
1298     }
1299     else if (checkCharRefNumber(c) < 0) {
1300       e->normal.type[i] = BT_NONXML;
1301       /* This shouldn't really get used. */
1302       e->utf16[i] = 0xFFFF;
1303       e->utf8[i][0] = 1;
1304       e->utf8[i][1] = 0;
1305     }
1306     else {
1307       if (c > 0xFFFF)
1308         return 0;
1309       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1310         e->normal.type[i] = BT_NMSTRT;
1311       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1312         e->normal.type[i] = BT_NAME;
1313       else
1314         e->normal.type[i] = BT_OTHER;
1315       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1316       e->utf16[i] = c;
1317     }
1318   }
1319   e->userData = userData;
1320   e->convert = convert;
1321   if (convert) {
1322     e->normal.isName2 = unknown_isName;
1323     e->normal.isName3 = unknown_isName;
1324     e->normal.isName4 = unknown_isName;
1325     e->normal.isNmstrt2 = unknown_isNmstrt;
1326     e->normal.isNmstrt3 = unknown_isNmstrt;
1327     e->normal.isNmstrt4 = unknown_isNmstrt;
1328     e->normal.isInvalid2 = unknown_isInvalid;
1329     e->normal.isInvalid3 = unknown_isInvalid;
1330     e->normal.isInvalid4 = unknown_isInvalid;
1331   }
1332   e->normal.enc.utf8Convert = unknown_toUtf8;
1333   e->normal.enc.utf16Convert = unknown_toUtf16;
1334   return &(e->normal.enc);
1335 }
1336
/* Indices identifying the built-in encodings; getEncodingIndex returns one
of these values and initScan uses them to index its encoding table.
If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  /* returned by getEncodingIndex for a name it does not recognise */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex when it is given a null name */
  NO_ENC
};
1350
1351 static
1352 int getEncodingIndex(const char *name)
1353 {
1354   static const char *encodingNames[] = {
1355     "ISO-8859-1",
1356     "US-ASCII",
1357     "UTF-8",
1358     "UTF-16",
1359     "UTF-16BE"
1360     "UTF-16LE",
1361   };
1362   int i;
1363   if (name == 0)
1364     return NO_ENC;
1365   for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
1366     if (streqci(name, encodingNames[i]))
1367       return i;
1368
1369   if (streqci(name, "ASCII"))
1370     return US_ASCII_ENC;
1371
1372   return UNKNOWN_ENC;
1373 }
1374
/* For binary compatibility, we store the index of the encoding specified
at initialization in the isUtf16 member.  The stored value is one of the
encoding indices (ISO_8859_1_ENC, UTF_16_ENC, ...); initScan compares it
against them and uses it to select an entry of its encoding table. */

#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
1379
1380 /* This is what detects the encoding.
1381 encodingTable maps from encoding indices to encodings;
1382 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1383 state is XML_CONTENT_STATE if we're parsing an external text entity,
1384 and XML_PROLOG_STATE otherwise.
1385 */
1386
1387
1388 static
1389 int initScan(const ENCODING **encodingTable,
1390              const INIT_ENCODING *enc,
1391              int state,
1392              const char *ptr,
1393              const char *end,
1394              const char **nextTokPtr)
1395 {
1396   const ENCODING **encPtr;
1397
1398   if (ptr == end)
1399     return XML_TOK_NONE;
1400   encPtr = enc->encPtr;
1401   if (ptr + 1 == end) {
1402     /* only a single byte available for auto-detection */
1403     /* a well-formed document entity must have more than one byte */
1404     if (state != XML_CONTENT_STATE)
1405       return XML_TOK_PARTIAL;
1406     /* so we're parsing an external text entity... */
1407     /* if UTF-16 was externally specified, then we need at least 2 bytes */
1408     switch (INIT_ENC_INDEX(enc)) {
1409     case UTF_16_ENC:
1410     case UTF_16LE_ENC:
1411     case UTF_16BE_ENC:
1412       return XML_TOK_PARTIAL;
1413     }
1414     switch ((unsigned char)*ptr) {
1415     case 0xFE:
1416     case 0xFF:
1417     case 0xEF: /* possibly first byte of UTF-8 BOM */
1418       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1419           && state == XML_CONTENT_STATE)
1420         break;
1421       /* fall through */
1422     case 0x00:
1423     case 0x3C:
1424       return XML_TOK_PARTIAL;
1425     }
1426   }
1427   else {
1428     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1429     case 0xFEFF:
1430       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1431           && state == XML_CONTENT_STATE)
1432         break;
1433       *nextTokPtr = ptr + 2;
1434       *encPtr = encodingTable[UTF_16BE_ENC];
1435       return XML_TOK_BOM;
1436     /* 00 3C is handled in the default case */
1437     case 0x3C00:
1438       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1439            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1440           && state == XML_CONTENT_STATE)
1441         break;
1442       *encPtr = encodingTable[UTF_16LE_ENC];
1443       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1444     case 0xFFFE:
1445       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1446           && state == XML_CONTENT_STATE)
1447         break;
1448       *nextTokPtr = ptr + 2;
1449       *encPtr = encodingTable[UTF_16LE_ENC];
1450       return XML_TOK_BOM;
1451     case 0xEFBB:
1452       /* Maybe a UTF-8 BOM (EF BB BF) */
1453       /* If there's an explicitly specified (external) encoding
1454          of ISO-8859-1 or some flavour of UTF-16
1455          and this is an external text entity,
1456          don't look for the BOM,
1457          because it might be a legal data. */
1458       if (state == XML_CONTENT_STATE) {
1459         int e = INIT_ENC_INDEX(enc);
1460         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1461           break;
1462       }
1463       if (ptr + 2 == end)
1464         return XML_TOK_PARTIAL;
1465       if ((unsigned char)ptr[2] == 0xBF) {
1466         *encPtr = encodingTable[UTF_8_ENC];
1467         return XML_TOK_BOM;
1468       }
1469       break;
1470     default:
1471       if (ptr[0] == '\0') {
1472         /* 0 isn't a legal data character. Furthermore a document entity can only
1473            start with ASCII characters.  So the only way this can fail to be big-endian
1474            UTF-16 if it it's an external parsed general entity that's labelled as
1475            UTF-16LE. */
1476         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1477           break;
1478         *encPtr = encodingTable[UTF_16BE_ENC];
1479         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1480       }
1481       else if (ptr[1] == '\0') {
1482         /* We could recover here in the case:
1483             - parsing an external entity
1484             - second byte is 0
1485             - no externally specified encoding
1486             - no encoding declaration
1487            by assuming UTF-16LE.  But we don't, because this would mean when
1488            presented just with a single byte, we couldn't reliably determine
1489            whether we needed further bytes. */
1490         if (state == XML_CONTENT_STATE)
1491           break;
1492         *encPtr = encodingTable[UTF_16LE_ENC];
1493         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1494       }
1495       break;
1496     }
1497   }
1498   *encPtr = encodingTable[(int)INIT_ENC_INDEX(enc)];
1499   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1500 }
1501
1502
/* First inclusion of xmltok_ns.c: NS() and ns() expand to their argument
   unchanged, so names wrapped in them come out without a suffix.
   NOTE(review): presumably xmltok_ns.c wraps its identifiers in NS()/ns();
   confirm against that file. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns
1508
1509 #ifdef XML_NS
1510
/* Second inclusion of xmltok_ns.c, only when XML_NS is defined: NS() pastes
   "NS" and ns() pastes "_ns" onto its argument, so the same source yields a
   separately named set of definitions. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
1518
1519 ENCODING *
1520 XmlInitUnknownEncodingNS(void *mem,
1521                          int *table,
1522                          int (*convert)(void *userData, const char *p),
1523                          void *userData)
1524 {
1525   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1526   if (enc)
1527     ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
1528   return enc;
1529 }
1530
1531 #endif /* XML_NS */