library/HTML5/Tokenizer.php

   1 <?php
   2
   3 /*
   4
   5 Copyright 2007 Jeroen van der Meer <http://jero.net/>
   6 Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
   7 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
   8
   9 Permission is hereby granted, free of charge, to any person obtaining a
  10 copy of this software and associated documentation files (the
  11 "Software"), to deal in the Software without restriction, including
  12 without limitation the rights to use, copy, modify, merge, publish,
  13 distribute, sublicense, and/or sell copies of the Software, and to
  14 permit persons to whom the Software is furnished to do so, subject to
  15 the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included
  18 in all copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27
  28 */
  29
  30 // Some conventions:
  31 // /* */ indicates verbatim text from the HTML 5 specification
  32 // // indicates regular comments
  33
  34 // all flags are in hyphenated form
  35
  36 class HTML5_Tokenizer {
  37     /**
  38      * Points to an InputStream object.
  39      */
  40     protected $stream;
  41
  42     /**
  43      * Tree builder that the tokenizer emits tokens to.
  44      */
  45     private $tree;
  46
  47     /**
  48      * Current content model we are parsing as.
  49      */
  50     protected $content_model;
  51
  52     /**
  53      * Current token that is being built, but not yet emitted. Also
  54      * is the last token emitted, if applicable.
  55      */
  56     protected $token;
  57
  58     // These are constants describing the content model
  59     const PCDATA    = 0;
  60     const RCDATA    = 1;
  61     const CDATA     = 2;
  62     const PLAINTEXT = 3;
  63
  64     // These are constants describing tokens
  65     // XXX should probably be moved somewhere else, probably the
  66     // HTML5 class.
  67     const DOCTYPE        = 0;
  68     const STARTTAG       = 1;
  69     const ENDTAG         = 2;
  70     const COMMENT        = 3;
  71     const CHARACTER      = 4;
  72     const SPACECHARACTER = 5;
  73     const EOF            = 6;
  74     const PARSEERROR     = 7;
  75
  76     // These are constants representing bunches of characters.
  77     const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
  78     const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
  79     const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
  80     const DIGIT       = '0123456789';
  81     const HEX         = '0123456789ABCDEFabcdef';
  82     const WHITESPACE  = "\t\n\x0c ";
  83
  84     /**
  85      * @param $data Data to parse
  86      */
  87     public function __construct($data, $builder = null) {
  88         $this->stream = new HTML5_InputStream($data);
  89         if (!$builder) $this->tree = new HTML5_TreeBuilder;
  90         $this->content_model = self::PCDATA;
  91     }
  92
  93     public function parseFragment($context = null) {
  94         $this->tree->setupContext($context);
  95         if ($this->tree->content_model) {
  96             $this->content_model = $this->tree->content_model;
  97             $this->tree->content_model = null;
  98         }
  99         $this->parse();
 100     }
 101
 102     // XXX maybe convert this into an iterator? regardless, this function
 103     // and the save function should go into a Parser facade of some sort
 104     /**
 105      * Performs the actual parsing of the document.
 106      */
 107     public function parse() {
 108         // Current state
 109         $state = 'data';
 110         // This is used to avoid having to have look-behind in the data state.
 111         $lastFourChars = '';
 112         /**
 113          * Escape flag as specified by the HTML5 specification: "used to
 114          * control the behavior of the tokeniser. It is either true or
 115          * false, and initially must be set to the false state."
 116          */
 117         $escape = false;
 118         //echo "\n\n";
 119         while($state !== null) {
 120
 121             /*echo $state . ' ';
 122             switch ($this->content_model) {
 123                 case self::PCDATA: echo 'PCDATA'; break;
 124                 case self::RCDATA: echo 'RCDATA'; break;
 125                 case self::CDATA: echo 'CDATA'; break;
 126                 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
 127             }
 128             if ($escape) echo " escape";
 129             echo "\n";*/
 130
 131             switch($state) {
 132                 case 'data':
 133
 134                     /* Consume the next input character */
 135                     $char = $this->stream->char();
 136                     $lastFourChars .= $char;
 137                     if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 138
 139                     // see below for meaning
 140                     $hyp_cond =
 141                         !$escape &&
 142                         (
 143                             $this->content_model === self::RCDATA ||
 144                             $this->content_model === self::CDATA
 145                         );
 146                     $amp_cond =
 147                         !$escape &&
 148                         (
 149                             $this->content_model === self::PCDATA ||
 150                             $this->content_model === self::RCDATA
 151                         );
 152                     $lt_cond =
 153                         $this->content_model === self::PCDATA ||
 154                         (
 155                             (
 156                                 $this->content_model === self::RCDATA ||
 157                                 $this->content_model === self::CDATA
 158                              ) &&
 159                              !$escape
 160                         );
 161                     $gt_cond =
 162                         $escape &&
 163                         (
 164                             $this->content_model === self::RCDATA ||
 165                             $this->content_model === self::CDATA
 166                         );
 167
 168                     if($char === '&' && $amp_cond) {
 169                         /* U+0026 AMPERSAND (&)
 170                         When the content model flag is set to one of the PCDATA or RCDATA
 171                         states and the escape flag is false: switch to the
 172                         character reference data state. Otherwise: treat it as per
 173                         the "anything else" entry below. */
 174                         $state = 'characterReferenceData';
 175
 176                     } elseif(
 177                         $char === '-' &&
 178                         $hyp_cond &&
 179                         $lastFourChars === '<!--'
 180                     ) {
 181                         /*
 182                         U+002D HYPHEN-MINUS (-)
 183                         If the content model flag is set to either the RCDATA state or
 184                         the CDATA state, and the escape flag is false, and there are at
 185                         least three characters before this one in the input stream, and the
 186                         last four characters in the input stream, including this one, are
 187                         U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
 188                         and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
 189                         $escape = true;
 190
 191                         /* In any case, emit the input character as a character token. Stay
 192                         in the data state. */
 193                         $this->emitToken(array(
 194                             'type' => self::CHARACTER,
 195                             'data' => '-'
 196                         ));
 197                         // We do the "any case" part as part of "anything else".
 198
 199                     /* U+003C LESS-THAN SIGN (<) */
 200                     } elseif($char === '<' && $lt_cond) {
 201                         /* When the content model flag is set to the PCDATA state: switch
 202                         to the tag open state.
 203
 204                         When the content model flag is set to either the RCDATA state or
 205                         the CDATA state and the escape flag is false: switch to the tag
 206                         open state.
 207
 208                         Otherwise: treat it as per the "anything else" entry below. */
 209                         $state = 'tagOpen';
 210
 211                     /* U+003E GREATER-THAN SIGN (>) */
 212                     } elseif(
 213                         $char === '>' &&
 214                         $gt_cond &&
 215                         substr($lastFourChars, 1) === '-->'
 216                     ) {
 217                         /* If the content model flag is set to either the RCDATA state or
 218                         the CDATA state, and the escape flag is true, and the last three
 219                         characters in the input stream including this one are U+002D
 220                         HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
 221                         set the escape flag to false. */
 222                         $escape = false;
 223
 224                         /* In any case, emit the input character as a character token.
 225                         Stay in the data state. */
 226                         $this->emitToken(array(
 227                             'type' => self::CHARACTER,
 228                             'data' => '>'
 229                         ));
 230                         // We do the "any case" part as part of "anything else".
 231
 232                     } elseif($char === false) {
 233                         /* EOF
 234                         Emit an end-of-file token. */
 235                         $state = null;
 236                         $this->tree->emitToken(array(
 237                             'type' => self::EOF
 238                         ));
 239
 240                     } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 241                         // Directly after emitting a token you switch back to the "data
 242                         // state". At that point spaceCharacters are important so they are
 243                         // emitted separately.
 244                         $chars = $this->stream->charsWhile(self::WHITESPACE);
 245                         $this->emitToken(array(
 246                             'type' => self::SPACECHARACTER,
 247                             'data' => $char . $chars
 248                         ));
 249                         $lastFourChars .= $chars;
 250                         if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 251
 252                     } else {
 253                         /* Anything else
 254                         THIS IS AN OPTIMIZATION: Get as many character that
 255                         otherwise would also be treated as a character token and emit it
 256                         as a single character token. Stay in the data state. */
 257
 258                         $mask = '';
 259                         if ($hyp_cond) $mask .= '-';
 260                         if ($amp_cond) $mask .= '&';
 261                         if ($lt_cond)  $mask .= '<';
 262                         if ($gt_cond)  $mask .= '>';
 263
 264                         if ($mask === '') {
 265                             $chars = $this->stream->remainingChars();
 266                         } else {
 267                             $chars = $this->stream->charsUntil($mask);
 268                         }
 269
 270                         $this->emitToken(array(
 271                             'type' => self::CHARACTER,
 272                             'data' => $char . $chars
 273                         ));
 274
 275                         $lastFourChars .= $chars;
 276                         if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 277
 278                         $state = 'data';
 279                     }
 280                 break;
 281
 282                 case 'characterReferenceData':
 283                     /* (This cannot happen if the content model flag
 284                     is set to the CDATA state.) */
 285
 286                     /* Attempt to consume a character reference, with no
 287                     additional allowed character. */
 288                     $entity = $this->consumeCharacterReference();
 289
 290                     /* If nothing is returned, emit a U+0026 AMPERSAND
 291                     character token. Otherwise, emit the character token that
 292                     was returned. */
 293                     // This is all done when consuming the character reference.
 294                     $this->emitToken(array(
 295                         'type' => self::CHARACTER,
 296                         'data' => $entity
 297                     ));
 298
 299                     /* Finally, switch to the data state. */
 300                     $state = 'data';
 301                 break;
 302
 303                 case 'tagOpen':
 304                     $char = $this->stream->char();
 305
 306                     switch($this->content_model) {
 307                         case self::RCDATA:
 308                         case self::CDATA:
 309                             /* Consume the next input character. If it is a
 310                             U+002F SOLIDUS (/) character, switch to the close
 311                             tag open state. Otherwise, emit a U+003C LESS-THAN
 312                             SIGN character token and reconsume the current input
 313                             character in the data state. */
 314                             // We consumed above.
 315
 316                             if($char === '/') {
 317                                 $state = 'closeTagOpen';
 318
 319                             } else {
 320                                 $this->emitToken(array(
 321                                     'type' => self::CHARACTER,
 322                                     'data' => '<'
 323                                 ));
 324
 325                                 $this->stream->unget();
 326
 327                                 $state = 'data';
 328                             }
 329                         break;
 330
 331                         case self::PCDATA:
 332                             /* If the content model flag is set to the PCDATA state
 333                             Consume the next input character: */
 334                             // We consumed above.
 335
 336                             if($char === '!') {
 337                                 /* U+0021 EXCLAMATION MARK (!)
 338                                 Switch to the markup declaration open state. */
 339                                 $state = 'markupDeclarationOpen';
 340
 341                             } elseif($char === '/') {
 342                                 /* U+002F SOLIDUS (/)
 343                                 Switch to the close tag open state. */
 344                                 $state = 'closeTagOpen';
 345
 346                             } elseif('A' <= $char && $char <= 'Z') {
 347                                 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 348                                 Create a new start tag token, set its tag name to the lowercase
 349                                 version of the input character (add 0x0020 to the character's code
 350                                 point), then switch to the tag name state. (Don't emit the token
 351                                 yet; further details will be filled in before it is emitted.) */
 352                                 $this->token = array(
 353                                     'name'  => strtolower($char),
 354                                     'type'  => self::STARTTAG,
 355                                     'attr'  => array()
 356                                 );
 357
 358                                 $state = 'tagName';
 359
 360                             } elseif('a' <= $char && $char <= 'z') {
 361                                 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
 362                                 Create a new start tag token, set its tag name to the input
 363                                 character, then switch to the tag name state. (Don't emit
 364                                 the token yet; further details will be filled in before it
 365                                 is emitted.) */
 366                                 $this->token = array(
 367                                     'name'  => $char,
 368                                     'type'  => self::STARTTAG,
 369                                     'attr'  => array()
 370                                 );
 371
 372                                 $state = 'tagName';
 373
 374                             } elseif($char === '>') {
 375                                 /* U+003E GREATER-THAN SIGN (>)
 376                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
 377                                 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
 378                                 $this->emitToken(array(
 379                                     'type' => self::PARSEERROR,
 380                                     'data' => 'expected-tag-name-but-got-right-bracket'
 381                                 ));
 382                                 $this->emitToken(array(
 383                                     'type' => self::CHARACTER,
 384                                     'data' => '<>'
 385                                 ));
 386
 387                                 $state = 'data';
 388
 389                             } elseif($char === '?') {
 390                                 /* U+003F QUESTION MARK (?)
 391                                 Parse error. Switch to the bogus comment state. */
 392                                 $this->emitToken(array(
 393                                     'type' => self::PARSEERROR,
 394                                     'data' => 'expected-tag-name-but-got-question-mark'
 395                                 ));
 396                                 $this->token = array(
 397                                     'data' => '?',
 398                                     'type' => self::COMMENT
 399                                 );
 400                                 $state = 'bogusComment';
 401
 402                             } else {
 403                                 /* Anything else
 404                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and
 405                                 reconsume the current input character in the data state. */
 406                                 $this->emitToken(array(
 407                                     'type' => self::PARSEERROR,
 408                                     'data' => 'expected-tag-name'
 409                                 ));
 410                                 $this->emitToken(array(
 411                                     'type' => self::CHARACTER,
 412                                     'data' => '<'
 413                                 ));
 414
 415                                 $state = 'data';
 416                                 $this->stream->unget();
 417                             }
 418                         break;
 419                     }
 420                 break;
 421
 422                 case 'closeTagOpen':
 423                     if (
 424                         $this->content_model === self::RCDATA ||
 425                         $this->content_model === self::CDATA
 426                     ) {
 427                         /* If the content model flag is set to the RCDATA or CDATA
 428                         states... */
 429                         $name = strtolower($this->stream->charsWhile(self::ALPHA));
 430                         $following = $this->stream->char();
 431                         $this->stream->unget();
 432                         if (
 433                             !$this->token ||
 434                             $this->token['name'] !== $name ||
 435                             $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
 436                         ) {
 437                             /* if no start tag token has ever been emitted by this instance
 438                             of the tokenizer (fragment case), or, if the next few
 439                             characters do not match the tag name of the last start tag
 440                             token emitted (compared in an ASCII case-insensitive manner),
 441                             or if they do but they are not immediately followed by one of
 442                             the following characters:
 443
 444                                 * U+0009 CHARACTER TABULATION
 445                                 * U+000A LINE FEED (LF)
 446                                 * U+000C FORM FEED (FF)
 447                                 * U+0020 SPACE
 448                                 * U+003E GREATER-THAN SIGN (>)
 449                                 * U+002F SOLIDUS (/)
 450                                 * EOF
 451
 452                             ...then emit a U+003C LESS-THAN SIGN character token, a
 453                             U+002F SOLIDUS character token, and switch to the data
 454                             state to process the next input character. */
 455                             // XXX: Probably ought to replace in_array with $following === x ||...
 456
 457                             // We also need to emit $name now we've consumed that, as we
 458                             // know it'll just be emitted as a character token.
 459                             $this->emitToken(array(
 460                                 'type' => self::CHARACTER,
 461                                 'data' => '</' . $name
 462                             ));
 463
 464                             $state = 'data';
 465                         } else {
 466                             // This matches what would happen if we actually did the
 467                             // otherwise below (but we can't because we've consumed too
 468                             // much).
 469
 470                             // Start the end tag token with the name we already have.
 471                             $this->token = array(
 472                                 'name'  => $name,
 473                                 'type'  => self::ENDTAG
 474                             );
 475
 476                             // Change to tag name state.
 477                             $state = 'tagName';
 478                         }
 479                     } elseif ($this->content_model === self::PCDATA) {
 480                         /* Otherwise, if the content model flag is set to the PCDATA
 481                         state [...]: */
 482                         $char = $this->stream->char();
 483
 484                         if ('A' <= $char && $char <= 'Z') {
 485                             /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 486                             Create a new end tag token, set its tag name to the lowercase version
 487                             of the input character (add 0x0020 to the character's code point), then
 488                             switch to the tag name state. (Don't emit the token yet; further details
 489                             will be filled in before it is emitted.) */
 490                             $this->token = array(
 491                                 'name'  => strtolower($char),
 492                                 'type'  => self::ENDTAG
 493                             );
 494
 495                             $state = 'tagName';
 496
 497                         } elseif ('a' <= $char && $char <= 'z') {
 498                             /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
 499                             Create a new end tag token, set its tag name to the
 500                             input character, then switch to the tag name state.
 501                             (Don't emit the token yet; further details will be
 502                             filled in before it is emitted.) */
 503                             $this->token = array(
 504                                 'name'  => $char,
 505                                 'type'  => self::ENDTAG
 506                             );
 507
 508                             $state = 'tagName';
 509
 510                         } elseif($char === '>') {
 511                             /* U+003E GREATER-THAN SIGN (>)
 512                             Parse error. Switch to the data state. */
 513                             $this->emitToken(array(
 514                                 'type' => self::PARSEERROR,
 515                                 'data' => 'expected-closing-tag-but-got-right-bracket'
 516                             ));
 517                             $state = 'data';
 518
 519                         } elseif($char === false) {
 520                             /* EOF
 521                             Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
 522                             SOLIDUS character token. Reconsume the EOF character in the data state. */
 523                             $this->emitToken(array(
 524                                 'type' => self::PARSEERROR,
 525                                 'data' => 'expected-closing-tag-but-got-eof'
 526                             ));
 527                             $this->emitToken(array(
 528                                 'type' => self::CHARACTER,
 529                                 'data' => '</'
 530                             ));
 531
 532                             $this->stream->unget();
 533                             $state = 'data';
 534
 535                         } else {
 536                             /* Parse error. Switch to the bogus comment state. */
 537                             $this->emitToken(array(
 538                                 'type' => self::PARSEERROR,
 539                                 'data' => 'expected-closing-tag-but-got-char'
 540                             ));
 541                             $this->token = array(
 542                                 'data' => $char,
 543                                 'type' => self::COMMENT
 544                             );
 545                             $state = 'bogusComment';
 546                         }
 547                     }
 548                 break;
 549
 550                 case 'tagName':
 551                     /* Consume the next input character: */
 552                     $char = $this->stream->char();
 553
 554                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 555                         /* U+0009 CHARACTER TABULATION
 556                         U+000A LINE FEED (LF)
 557                         U+000C FORM FEED (FF)
 558                         U+0020 SPACE
 559                         Switch to the before attribute name state. */
 560                         $state = 'beforeAttributeName';
 561
 562                     } elseif($char === '/') {
 563                         /* U+002F SOLIDUS (/)
 564                         Switch to the self-closing start tag state. */
 565                         $state = 'selfClosingStartTag';
 566
 567                     } elseif($char === '>') {
 568                         /* U+003E GREATER-THAN SIGN (>)
 569                         Emit the current tag token. Switch to the data state. */
 570                         $this->emitToken($this->token);
 571                         $state = 'data';
 572
 573                     } elseif('A' <= $char && $char <= 'Z') {
 574                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 575                         Append the lowercase version of the current input
 576                         character (add 0x0020 to the character's code point) to
 577                         the current tag token's tag name. Stay in the tag name state. */
 578                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
 579
 580                         $this->token['name'] .= strtolower($char . $chars);
 581                         $state = 'tagName';
 582
 583                     } elseif($char === false) {
 584                         /* EOF
 585                         Parse error. Emit the current tag token. Reconsume the EOF
 586                         character in the data state. */
 587                         $this->emitToken(array(
 588                             'type' => self::PARSEERROR,
 589                             'data' => 'eof-in-tag-name'
 590                         ));
 591                         $this->emitToken($this->token);
 592
 593                         $this->stream->unget();
 594                         $state = 'data';
 595
 596                     } else {
 597                         /* Anything else
 598                         Append the current input character to the current tag token's tag name.
 599                         Stay in the tag name state. */
 600                         $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
 601
 602                         $this->token['name'] .= $char . $chars;
 603                         $state = 'tagName';
 604                     }
 605                 break;
 606
 607                 case 'beforeAttributeName':
 608                     /* Consume the next input character: */
 609                     $char = $this->stream->char();
 610
 611                     // this conditional is optimized, check bottom
 612                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 613                         /* U+0009 CHARACTER TABULATION
 614                         U+000A LINE FEED (LF)
 615                         U+000C FORM FEED (FF)
 616                         U+0020 SPACE
 617                         Stay in the before attribute name state. */
 618                         $state = 'beforeAttributeName';
 619
 620                     } elseif($char === '/') {
 621                         /* U+002F SOLIDUS (/)
 622                         Switch to the self-closing start tag state. */
 623                         $state = 'selfClosingStartTag';
 624
 625                     } elseif($char === '>') {
 626                         /* U+003E GREATER-THAN SIGN (>)
 627                         Emit the current tag token. Switch to the data state. */
 628                         $this->emitToken($this->token);
 629                         $state = 'data';
 630
 631                     } elseif('A' <= $char && $char <= 'Z') {
 632                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 633                         Start a new attribute in the current tag token. Set that
 634                         attribute's name to the lowercase version of the current
 635                         input character (add 0x0020 to the character's code
 636                         point), and its value to the empty string. Switch to the
 637                         attribute name state.*/
 638                         $this->token['attr'][] = array(
 639                             'name'  => strtolower($char),
 640                             'value' => ''
 641                         );
 642
 643                         $state = 'attributeName';
 644
 645                     } elseif($char === false) {
 646                         /* EOF
 647                         Parse error. Emit the current tag token. Reconsume the EOF
 648                         character in the data state. */
 649                         $this->emitToken(array(
 650                             'type' => self::PARSEERROR,
 651                             'data' => 'expected-attribute-name-but-got-eof'
 652                         ));
 653                         $this->emitToken($this->token);
 654
 655                         $this->stream->unget();
 656                         $state = 'data';
 657
 658                     } else {
 659                         /* U+0022 QUOTATION MARK (")
 660                            U+0027 APOSTROPHE (')
 661                            U+003D EQUALS SIGN (=)
 662                         Parse error. Treat it as per the "anything else" entry
 663                         below. */
 664                         if($char === '"' || $char === "'" || $char === '=') {
 665                             $this->emitToken(array(
 666                                 'type' => self::PARSEERROR,
 667                                 'data' => 'invalid-character-in-attribute-name'
 668                             ));
 669                         }
 670
 671                         /* Anything else
 672                         Start a new attribute in the current tag token. Set that attribute's
 673                         name to the current input character, and its value to the empty string.
 674                         Switch to the attribute name state. */
 675                         $this->token['attr'][] = array(
 676                             'name'  => $char,
 677                             'value' => ''
 678                         );
 679
 680                         $state = 'attributeName';
 681                     }
 682                 break;
 683
 684                 case 'attributeName':
 685                     // Consume the next input character:
 686                     $char = $this->stream->char();
 687
 688                     // this conditional is optimized, check bottom
 689                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 690                         /* U+0009 CHARACTER TABULATION
 691                         U+000A LINE FEED (LF)
 692                         U+000C FORM FEED (FF)
 693                         U+0020 SPACE
 694                         Switch to the after attribute name state. */
 695                         $state = 'afterAttributeName';
 696
 697                     } elseif($char === '/') {
 698                         /* U+002F SOLIDUS (/)
 699                         Switch to the self-closing start tag state. */
 700                         $state = 'selfClosingStartTag';
 701
 702                     } elseif($char === '=') {
 703                         /* U+003D EQUALS SIGN (=)
 704                         Switch to the before attribute value state. */
 705                         $state = 'beforeAttributeValue';
 706
 707                     } elseif($char === '>') {
 708                         /* U+003E GREATER-THAN SIGN (>)
 709                         Emit the current tag token. Switch to the data state. */
 710                         $this->emitToken($this->token);
 711                         $state = 'data';
 712
 713                     } elseif('A' <= $char && $char <= 'Z') {
 714                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 715                         Append the lowercase version of the current input
 716                         character (add 0x0020 to the character's code point) to
 717                         the current attribute's name. Stay in the attribute name
 718                         state. */
 719                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
 720
 721                         $last = count($this->token['attr']) - 1;
 722                         $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
 723
 724                         $state = 'attributeName';
 725
 726                     } elseif($char === false) {
 727                         /* EOF
 728                         Parse error. Emit the current tag token. Reconsume the EOF
 729                         character in the data state. */
 730                         $this->emitToken(array(
 731                             'type' => self::PARSEERROR,
 732                             'data' => 'eof-in-attribute-name'
 733                         ));
 734                         $this->emitToken($this->token);
 735
 736                         $this->stream->unget();
 737                         $state = 'data';
 738
 739                     } else {
 740                         /* U+0022 QUOTATION MARK (")
 741                            U+0027 APOSTROPHE (')
 742                         Parse error. Treat it as per the "anything else"
 743                         entry below. */
 744                         if($char === '"' || $char === "'") {
 745                             $this->emitToken(array(
 746                                 'type' => self::PARSEERROR,
 747                                 'data' => 'invalid-character-in-attribute-name'
 748                             ));
 749                         }
 750
 751                         /* Anything else
 752                         Append the current input character to the current attribute's name.
 753                         Stay in the attribute name state. */
 754                         $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
 755
 756                         $last = count($this->token['attr']) - 1;
 757                         $this->token['attr'][$last]['name'] .= $char . $chars;
 758
 759                         $state = 'attributeName';
 760                     }
 761
 762                     /* When the user agent leaves the attribute name state
 763                     (and before emitting the tag token, if appropriate), the
 764                     complete attribute's name must be compared to the other
 765                     attributes on the same token; if there is already an
 766                     attribute on the token with the exact same name, then this
 767                     is a parse error and the new attribute must be dropped, along
 768                     with the value that gets associated with it (if any). */
 769                     // this might be implemented in the emitToken method
 770                 break;
 771
 772                 case 'afterAttributeName':
 773                     // Consume the next input character:
 774                     $char = $this->stream->char();
 775
 776                     // this is an optimized conditional, check the bottom
 777                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 778                         /* U+0009 CHARACTER TABULATION
 779                         U+000A LINE FEED (LF)
 780                         U+000C FORM FEED (FF)
 781                         U+0020 SPACE
 782                         Stay in the after attribute name state. */
 783                         $state = 'afterAttributeName';
 784
 785                     } elseif($char === '/') {
 786                         /* U+002F SOLIDUS (/)
 787                         Switch to the self-closing start tag state. */
 788                         $state = 'selfClosingStartTag';
 789
 790                     } elseif($char === '=') {
 791                         /* U+003D EQUALS SIGN (=)
 792                         Switch to the before attribute value state. */
 793                         $state = 'beforeAttributeValue';
 794
 795                     } elseif($char === '>') {
 796                         /* U+003E GREATER-THAN SIGN (>)
 797                         Emit the current tag token. Switch to the data state. */
 798                         $this->emitToken($this->token);
 799                         $state = 'data';
 800
 801                     } elseif('A' <= $char && $char <= 'Z') {
 802                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 803                         Start a new attribute in the current tag token. Set that
 804                         attribute's name to the lowercase version of the current
 805                         input character (add 0x0020 to the character's code
 806                         point), and its value to the empty string. Switch to the
 807                         attribute name state. */
 808                         $this->token['attr'][] = array(
 809                             'name'  => strtolower($char),
 810                             'value' => ''
 811                         );
 812
 813                         $state = 'attributeName';
 814
 815                     } elseif($char === false) {
 816                         /* EOF
 817                         Parse error. Emit the current tag token. Reconsume the EOF
 818                         character in the data state. */
 819                         $this->emitToken(array(
 820                             'type' => self::PARSEERROR,
 821                             'data' => 'expected-end-of-tag-but-got-eof'
 822                         ));
 823                         $this->emitToken($this->token);
 824
 825                         $this->stream->unget();
 826                         $state = 'data';
 827
 828                     } else {
 829                         /* U+0022 QUOTATION MARK (")
 830                            U+0027 APOSTROPHE (')
 831                         Parse error. Treat it as per the "anything else"
 832                         entry below. */
 833                         if($char === '"' || $char === "'") {
 834                             $this->emitToken(array(
 835                                 'type' => self::PARSEERROR,
 836                                 'data' => 'invalid-character-after-attribute-name'
 837                             ));
 838                         }
 839
 840                         /* Anything else
 841                         Start a new attribute in the current tag token. Set that attribute's
 842                         name to the current input character, and its value to the empty string.
 843                         Switch to the attribute name state. */
 844                         $this->token['attr'][] = array(
 845                             'name'  => $char,
 846                             'value' => ''
 847                         );
 848
 849                         $state = 'attributeName';
 850                     }
 851                 break;
 852
 853                 case 'beforeAttributeValue':
 854                     // Consume the next input character:
 855                     $char = $this->stream->char();
 856
 857                     // this is an optimized conditional
 858                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 859                         /* U+0009 CHARACTER TABULATION
 860                         U+000A LINE FEED (LF)
 861                         U+000C FORM FEED (FF)
 862                         U+0020 SPACE
 863                         Stay in the before attribute value state. */
 864                         $state = 'beforeAttributeValue';
 865
 866                     } elseif($char === '"') {
 867                         /* U+0022 QUOTATION MARK (")
 868                         Switch to the attribute value (double-quoted) state. */
 869                         $state = 'attributeValueDoubleQuoted';
 870
 871                     } elseif($char === '&') {
 872                         /* U+0026 AMPERSAND (&)
 873                         Switch to the attribute value (unquoted) state and reconsume
 874                         this input character. */
 875                         $this->stream->unget();
 876                         $state = 'attributeValueUnquoted';
 877
 878                     } elseif($char === '\'') {
 879                         /* U+0027 APOSTROPHE (')
 880                         Switch to the attribute value (single-quoted) state. */
 881                         $state = 'attributeValueSingleQuoted';
 882
 883                     } elseif($char === '>') {
 884                         /* U+003E GREATER-THAN SIGN (>)
 885                         Parse error. Emit the current tag token. Switch to the data state. */
 886                         $this->emitToken(array(
 887                             'type' => self::PARSEERROR,
 888                             'data' => 'expected-attribute-value-but-got-right-bracket'
 889                         ));
 890                         $this->emitToken($this->token);
 891                         $state = 'data';
 892
 893                     } elseif($char === false) {
 894                         /* EOF
 895                         Parse error. Emit the current tag token. Reconsume
 896                         the character in the data state. */
 897                         $this->emitToken(array(
 898                             'type' => self::PARSEERROR,
 899                             'data' => 'expected-attribute-value-but-got-eof'
 900                         ));
 901                         $this->emitToken($this->token);
 902                         $this->stream->unget();
 903                         $state = 'data';
 904
 905                     } else {
 906                         /* U+003D EQUALS SIGN (=)
 907                         Parse error. Treat it as per the "anything else" entry below. */
 908                         if($char === '=') {
 909                             $this->emitToken(array(
 910                                 'type' => self::PARSEERROR,
 911                                 'data' => 'equals-in-unquoted-attribute-value'
 912                             ));
 913                         }
 914
 915                         /* Anything else
 916                         Append the current input character to the current attribute's value.
 917                         Switch to the attribute value (unquoted) state. */
 918                         $last = count($this->token['attr']) - 1;
 919                         $this->token['attr'][$last]['value'] .= $char;
 920
 921                         $state = 'attributeValueUnquoted';
 922                     }
 923                 break;
 924
 925                 case 'attributeValueDoubleQuoted':
 926                     // Consume the next input character:
 927                     $char = $this->stream->char();
 928
 929                     if($char === '"') {
 930                         /* U+0022 QUOTATION MARK (")
 931                         Switch to the after attribute value (quoted) state. */
 932                         $state = 'afterAttributeValueQuoted';
 933
 934                     } elseif($char === '&') {
 935                         /* U+0026 AMPERSAND (&)
 936                         Switch to the character reference in attribute value
 937                         state, with the additional allowed character
 938                         being U+0022 QUOTATION MARK ("). */
 939                         $this->characterReferenceInAttributeValue('"');
 940
 941                     } elseif($char === false) {
 942                         /* EOF
 943                         Parse error. Emit the current tag token. Reconsume the character
 944                         in the data state. */
 945                         $this->emitToken(array(
 946                             'type' => self::PARSEERROR,
 947                             'data' => 'eof-in-attribute-value-double-quote'
 948                         ));
 949                         $this->emitToken($this->token);
 950
 951                         $this->stream->unget();
 952                         $state = 'data';
 953
 954                     } else {
 955                         /* Anything else
 956                         Append the current input character to the current attribute's value.
 957                         Stay in the attribute value (double-quoted) state. */
 958                         $chars = $this->stream->charsUntil('"&');
 959
 960                         $last = count($this->token['attr']) - 1;
 961                         $this->token['attr'][$last]['value'] .= $char . $chars;
 962
 963                         $state = 'attributeValueDoubleQuoted';
 964                     }
 965                 break;
 966
 967                 case 'attributeValueSingleQuoted':
 968                     // Consume the next input character:
 969                     $char = $this->stream->char();
 970
 971                     if($char === "'") {
 972                         /* U+0027 APOSTROPHE (')
 973                         Switch to the after attribute value (quoted) state. */
 974                         $state = 'afterAttributeValueQuoted';
 975
 976                     } elseif($char === '&') {
 977                         /* U+0026 AMPERSAND (&)
 978                         Switch to the character reference in attribute value state, with the additional allowed character being U+0027 APOSTROPHE ('). */
 979                         $this->characterReferenceInAttributeValue("'");
 980
 981                     } elseif($char === false) {
 982                         /* EOF
 983                         Parse error. Emit the current tag token. Reconsume the character
 984                         in the data state. */
 985                         $this->emitToken(array(
 986                             'type' => self::PARSEERROR,
 987                             'data' => 'eof-in-attribute-value-single-quote'
 988                         ));
 989                         $this->emitToken($this->token);
 990
 991                         $this->stream->unget();
 992                         $state = 'data';
 993
 994                     } else {
 995                         /* Anything else
 996                         Append the current input character to the current attribute's value.
 997                         Stay in the attribute value (single-quoted) state. */
 998                         $chars = $this->stream->charsUntil("'&");
 999
1000                         $last = count($this->token['attr']) - 1;
1001                         $this->token['attr'][$last]['value'] .= $char . $chars;
1002
1003                         $state = 'attributeValueSingleQuoted';
1004                     }
1005                 break;
1006
1007                 case 'attributeValueUnquoted':
1008                     // Consume the next input character:
1009                     $char = $this->stream->char();
1010
1011                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1012                         /* U+0009 CHARACTER TABULATION
1013                         U+000A LINE FEED (LF)
1014                         U+000C FORM FEED (FF)
1015                         U+0020 SPACE
1016                         Switch to the before attribute name state. */
1017                         $state = 'beforeAttributeName';
1018
1019                     } elseif($char === '&') {
1020                         /* U+0026 AMPERSAND (&)
1021                         Switch to the character reference in attribute value state. */
1022                         $this->characterReferenceInAttributeValue();
1023
1024                     } elseif($char === '>') {
1025                         /* U+003E GREATER-THAN SIGN (>)
1026                         Emit the current tag token. Switch to the data state. */
1027                         $this->emitToken($this->token);
1028                         $state = 'data';
1029
1030                     } elseif ($char === false) {
1031                         /* EOF
1032                         Parse error. Emit the current tag token. Reconsume
1033                         the character in the data state. */
1034                         $this->emitToken(array(
1035                             'type' => self::PARSEERROR,
1036                             'data' => 'eof-in-attribute-value-no-quotes'
1037                         ));
1038                         $this->emitToken($this->token);
1039                         $this->stream->unget();
1040                         $state = 'data';
1041
1042                     } else {
1043                         /* U+0022 QUOTATION MARK (")
1044                            U+0027 APOSTROPHE (')
1045                            U+003D EQUALS SIGN (=)
1046                         Parse error. Treat it as per the "anything else"
1047                         entry below. */
1048                         if($char === '"' || $char === "'" || $char === '=') {
1049                             $this->emitToken(array(
1050                                 'type' => self::PARSEERROR,
1051                                 'data' => 'unexpected-character-in-unquoted-attribute-value'
1052                             ));
1053                         }
1054
1055                         /* Anything else
1056                         Append the current input character to the current attribute's value.
1057                         Stay in the attribute value (unquoted) state. */
1058                         $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1059
1060                         $last = count($this->token['attr']) - 1;
1061                         $this->token['attr'][$last]['value'] .= $char . $chars;
1062
1063                         $state = 'attributeValueUnquoted';
1064                     }
1065                 break;
1066
1067                 case 'afterAttributeValueQuoted':
1068                     /* Consume the next input character: */
1069                     $char = $this->stream->char();
1070
1071                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1072                         /* U+0009 CHARACTER TABULATION
1073                            U+000A LINE FEED (LF)
1074                            U+000C FORM FEED (FF)
1075                            U+0020 SPACE
1076                         Switch to the before attribute name state. */
1077                         $state = 'beforeAttributeName';
1078
1079                     } elseif ($char === '/') {
1080                         /* U+002F SOLIDUS (/)
1081                         Switch to the self-closing start tag state. */
1082                         $state = 'selfClosingStartTag';
1083
1084                     } elseif ($char === '>') {
1085                         /* U+003E GREATER-THAN SIGN (>)
1086                         Emit the current tag token. Switch to the data state. */
1087                         $this->emitToken($this->token);
1088                         $state = 'data';
1089
1090                     } elseif ($char === false) {
1091                         /* EOF
1092                         Parse error. Emit the current tag token. Reconsume the EOF
1093                         character in the data state. */
1094                         $this->emitToken(array(
1095                             'type' => self::PARSEERROR,
1096                             'data' => 'unexpected-EOF-after-attribute-value'
1097                         ));
1098                         $this->emitToken($this->token);
1099                         $this->stream->unget();
1100                         $state = 'data';
1101
1102                     } else {
1103                         /* Anything else
1104                         Parse error. Reconsume the character in the before attribute
1105                         name state. */
1106                         $this->emitToken(array(
1107                             'type' => self::PARSEERROR,
1108                             'data' => 'unexpected-character-after-attribute-value'
1109                         ));
1110                         $this->stream->unget();
1111                         $state = 'beforeAttributeName';
1112                     }
1113                 break;
1114
1115                 case 'selfClosingStartTag':
1116                     /* Consume the next input character: */
1117                     $char = $this->stream->char();
1118
1119                     if ($char === '>') {
1120                         /* U+003E GREATER-THAN SIGN (>)
1121                         Set the self-closing flag of the current tag token.
1122                         Emit the current tag token. Switch to the data state. */
1123                         // not sure if this is the name we want
1124                         $this->token['self-closing'] = true;
1125                         /* When an end tag token is emitted with its self-closing flag set,
1126                         that is a parse error. */
1127                         if ($this->token['type'] === self::ENDTAG) {
1128                             $this->emitToken(array(
1129                                 'type' => self::PARSEERROR,
1130                                 'data' => 'self-closing-end-tag'
1131                             ));
1132                         }
1133                         $this->emitToken($this->token);
1134                         $state = 'data';
1135
1136                     } elseif ($char === false) {
1137                         /* EOF
1138                         Parse error. Emit the current tag token. Reconsume the
1139                         EOF character in the data state. */
1140                         $this->emitToken(array(
1141                             'type' => self::PARSEERROR,
1142                             'data' => 'unexpected-eof-after-self-closing'
1143                         ));
1144                         $this->emitToken($this->token);
1145                         $this->stream->unget();
1146                         $state = 'data';
1147
1148                     } else {
1149                         /* Anything else
1150                         Parse error. Reconsume the character in the before attribute name state. */
1151                         $this->emitToken(array(
1152                             'type' => self::PARSEERROR,
1153                             'data' => 'unexpected-character-after-self-closing'
1154                         ));
1155                         $this->stream->unget();
1156                         $state = 'beforeAttributeName';
1157                     }
1158                 break;
1159
1160                 case 'bogusComment':
1161                     /* (This can only happen if the content model flag is set to the PCDATA state.) */
1162                     /* Consume every character up to the first U+003E GREATER-THAN SIGN
1163                     character (>) or the end of the file (EOF), whichever comes first. Emit
1164                     a comment token whose data is the concatenation of all the characters
1165                     starting from and including the character that caused the state machine
1166                     to switch into the bogus comment state, up to and including the last
1167                     consumed character before the U+003E character, if any, or up to the
1168                     end of the file otherwise. (If the comment was started by the end of
1169                     the file (EOF), the token is empty.) */
1170                     $this->token['data'] .= (string) $this->stream->charsUntil('>');
1171                     $this->stream->char();
1172
1173                     $this->emitToken($this->token);
1174
1175                     /* Switch to the data state. */
1176                     $state = 'data';
1177                 break;
1178
1179                 case 'markupDeclarationOpen':
1180                     // Consume the look-ahead (hyphens, otherwise letters) that the checks below test
1181                     $hyphens = $this->stream->charsWhile('-', 2);
1182                     if ($hyphens === '-') {
1183                         $this->stream->unget();
1184                     }
1185                     if ($hyphens !== '--') {
1186                         $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1187                     }
1188
1189                     /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1190                     characters, consume those two characters, create a comment token whose
1191                     data is the empty string, and switch to the comment state. */
1192                     if($hyphens === '--') {
1193                         $state = 'commentStart';
1194                         $this->token = array(
1195                             'data' => '',
1196                             'type' => self::COMMENT
1197                         );
1198
1199                     /* Otherwise if the next seven characters are a case-insensitive match
1200                     for the word "DOCTYPE", then consume those characters and switch to the
1201                     DOCTYPE state. */
1202                     } elseif(strtoupper($alpha) === 'DOCTYPE') {
1203                         $state = 'doctype';
1204
1205                     // XXX not implemented
1206                     /* Otherwise, if the insertion mode is "in foreign content"
1207                     and the current node is not an element in the HTML namespace
1208                     and the next seven characters are an ASCII case-sensitive
1209                     match for the string "[CDATA[" (the five uppercase letters
1210                     "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1211                     and after), then consume those characters and switch to the
1212                     CDATA section state (which is unrelated to the content model
1213                     flag's CDATA state). */
1214
1215                     /* Otherwise, this is a parse error. Switch to the bogus comment state.
1216                     The next character that is consumed, if any, is the first character
1217                     that will be in the comment. */
1218                     } else {
1219                         $this->emitToken(array(
1220                             'type' => self::PARSEERROR,
1221                             'data' => 'expected-dashes-or-doctype'
1222                         ));
1223                         $this->token = array(
1224                             'data' => (string) $alpha,
1225                             'type' => self::COMMENT
1226                         );
1227                         $state = 'bogusComment';
1228                     }
1229                 break;
1230
1231                 case 'commentStart':
                         // State 'commentStart': reads one character and either advances to
                         // 'commentStartDash' or 'comment', or emits the pending comment token
                         // and returns to 'data'. $this->token is appended to and emitted here
                         // but never created, so an earlier state must already have set it up.
1232                     /* Consume the next input character: */
1233                     $char = $this->stream->char();
1234
1235                     if ($char === '-') {
1236                         /* U+002D HYPHEN-MINUS (-)
1237                         Switch to the comment start dash state. */
1238                         $state = 'commentStartDash';
1239                     } elseif ($char === '>') {
1240                         /* U+003E GREATER-THAN SIGN (>)
1241                         Parse error. Emit the comment token. Switch to the
1242                         data state. */
                             // The parse error is emitted as its own token, ahead of the comment.
1243                         $this->emitToken(array(
1244                             'type' => self::PARSEERROR,
1245                             'data' => 'incorrect-comment'
1246                         ));
1247                         $this->emitToken($this->token);
1248                         $state = 'data';
1249                     } elseif ($char === false) {
                             // false is the value this code treats as end of input.
1250                         /* EOF
1251                         Parse error. Emit the comment token. Reconsume the
1252                         EOF character in the data state. */
1253                         $this->emitToken(array(
1254                             'type' => self::PARSEERROR,
1255                             'data' => 'eof-in-comment'
1256                         ));
1257                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1258                         $this->stream->unget();
1259                         $state = 'data';
1260                     } else {
1261                         /* Anything else
1262                         Append the input character to the comment token's
1263                         data. Switch to the comment state. */
1264                         $this->token['data'] .= $char;
1265                         $state = 'comment';
1266                     }
1267                 break;
1268
1269                 case 'commentStartDash':
                         // State 'commentStartDash': reached from 'commentStart' after a '-'.
                         // A second '-' moves to 'commentEnd'; '>' or end of input emit the
                         // pending comment token with a parse error; any other character is
                         // stored as comment data.
1270                     /* Consume the next input character: */
1271                     $char = $this->stream->char();
1272                     if ($char === '-') {
1273                         /* U+002D HYPHEN-MINUS (-)
1274                         Switch to the comment end state */
1275                         $state = 'commentEnd';
1276                     } elseif ($char === '>') {
1277                         /* U+003E GREATER-THAN SIGN (>)
1278                         Parse error. Emit the comment token. Switch to the
1279                         data state. */
1280                         $this->emitToken(array(
1281                             'type' => self::PARSEERROR,
1282                             'data' => 'incorrect-comment'
1283                         ));
1284                         $this->emitToken($this->token);
1285                         $state = 'data';
1286                     } elseif ($char === false) {
                             // End of input (false is the value this code treats as EOF).
1287                         /* Parse error. Emit the comment token. Reconsume the
1288                         EOF character in the data state. */
1289                         $this->emitToken(array(
1290                             'type' => self::PARSEERROR,
1291                             'data' => 'eof-in-comment'
1292                         ));
1293                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1294                         $this->stream->unget();
1295                         $state = 'data';
1296                     } else {
                             // Anything else: the dash that led into this state was not the
                             // start of a terminator, so it is stored as data together with
                             // the current character and parsing continues in 'comment'.
1297                         $this->token['data'] .= '-' . $char;
1298                         $state = 'comment';
1299                     }
1300                 break;
1301
1302                 case 'comment':
                         // State 'comment': accumulates comment text in $this->token['data']
                         // until a '-' (possible start of the terminator) or end of input.
1303                     /* Consume the next input character: */
1304                     $char = $this->stream->char();
1305
1306                     if($char === '-') {
1307                         /* U+002D HYPHEN-MINUS (-)
1308                         Switch to the comment end dash state */
1309                         $state = 'commentEndDash';
1310
1311                     } elseif($char === false) {
1312                         /* EOF
1313                         Parse error. Emit the comment token. Reconsume the EOF character
1314                         in the data state. */
1315                         $this->emitToken(array(
1316                             'type' => self::PARSEERROR,
1317                             'data' => 'eof-in-comment'
1318                         ));
1319                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1320                         $this->stream->unget();
1321                         $state = 'data';
1322
1323                     } else {
1324                         /* Anything else
1325                         Append the input character to the comment token's data. Stay in
1326                         the comment state. */
                             // Rather than looping once per character, the rest of the run is
                             // fetched in one call. Presumably charsUntil('-') returns the
                             // characters up to, but not including, the next '-'; confirm in
                             // the stream class.
1327                         $chars = $this->stream->charsUntil('-');
1328
1329                         $this->token['data'] .= $char . $chars;
1330                     }
1331                 break;
1332
1333                 case 'commentEndDash':
                         // State 'commentEndDash': reached from 'comment' after a single '-'.
                         // A second '-' moves to 'commentEnd'; end of input emits the pending
                         // comment token; anything else means the dash was ordinary data.
1334                     /* Consume the next input character: */
1335                     $char = $this->stream->char();
1336
1337                     if($char === '-') {
1338                         /* U+002D HYPHEN-MINUS (-)
1339                         Switch to the comment end state  */
1340                         $state = 'commentEnd';
1341
1342                     } elseif($char === false) {
1343                         /* EOF
1344                         Parse error. Emit the comment token. Reconsume the EOF character
1345                         in the data state. */
1346                         $this->emitToken(array(
1347                             'type' => self::PARSEERROR,
1348                             'data' => 'eof-in-comment-end-dash'
1349                         ));
                             // The pending dash is not added to the data before emitting.
1350                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1351                         $this->stream->unget();
1352                         $state = 'data';
1353
1354                     } else {
1355                         /* Anything else
1356                         Append a U+002D HYPHEN-MINUS (-) character and the input
1357                         character to the comment token's data. Switch to the comment state. */
1358                         $this->token['data'] .= '-'.$char;
1359                         $state = 'comment';
1360                     }
1361                 break;
1362
1363                 case 'commentEnd':
                         // State 'commentEnd': two consecutive dashes have been read. '>'
                         // finishes the comment normally; every other outcome is reported as
                         // a parse error, with the dashes (where applicable) kept as data.
1364                     /* Consume the next input character: */
1365                     $char = $this->stream->char();
1366
1367                     if($char === '>') {
1368                         /* U+003E GREATER-THAN SIGN (>)
1369                         Emit the comment token. Switch to the data state. */
1370                         $this->emitToken($this->token);
1371                         $state = 'data';
1372
1373                     } elseif($char === '-') {
1374                         /* U+002D HYPHEN-MINUS (-)
1375                         Parse error. Append a U+002D HYPHEN-MINUS (-) character
1376                         to the comment token's data. Stay in the comment end
1377                         state. */
1378                         $this->emitToken(array(
1379                             'type' => self::PARSEERROR,
1380                             'data' => 'unexpected-dash-after-double-dash-in-comment'
1381                         ));
                             // $state is left untouched, so the next pass re-enters this case.
1382                         $this->token['data'] .= '-';
1383
1384                     } elseif($char === false) {
1385                         /* EOF
1386                         Parse error. Emit the comment token. Reconsume the
1387                         EOF character in the data state. */
1388                         $this->emitToken(array(
1389                             'type' => self::PARSEERROR,
1390                             'data' => 'eof-in-comment-double-dash'
1391                         ));
1392                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1393                         $this->stream->unget();
1394                         $state = 'data';
1395
1396                     } else {
1397                         /* Anything else
1398                         Parse error. Append two U+002D HYPHEN-MINUS (-)
1399                         characters and the input character to the comment token's
1400                         data. Switch to the comment state. */
1401                         $this->emitToken(array(
1402                             'type' => self::PARSEERROR,
1403                             'data' => 'unexpected-char-in-comment'
1404                         ));
1405                         $this->token['data'] .= '--'.$char;
1406                         $state = 'comment';
1407                     }
1408                 break;
1409
1410                 case 'doctype':
                         // State 'doctype': expects whitespace after the DOCTYPE keyword.
                         // Both branches continue in 'beforeDoctypeName'; they differ only in
                         // whether a parse error is reported and the character is put back.
1411                     /* Consume the next input character: */
1412                     $char = $this->stream->char();
1413
1414                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1415                         /* U+0009 CHARACTER TABULATION
1416                            U+000A LINE FEED (LF)
1417                            U+000C FORM FEED (FF)
1418                            U+0020 SPACE
1419                         Switch to the before DOCTYPE name state. */
1420                         $state = 'beforeDoctypeName';
1421
1422                     } else {
1423                         /* Anything else
1424                         Parse error. Reconsume the current character in the
1425                         before DOCTYPE name state. */
                             // There is no separate end-of-input branch in this state, so a
                             // false from char() is handled here like any other character.
1426                         $this->emitToken(array(
1427                             'type' => self::PARSEERROR,
1428                             'data' => 'need-space-after-doctype'
1429                         ));
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1430                         $this->stream->unget();
1431                         $state = 'beforeDoctypeName';
1432                     }
1433                 break;
1434
1435                 case 'beforeDoctypeName':
                         // State 'beforeDoctypeName': skips whitespace, then either starts a
                         // new DOCTYPE token in $this->token from the first name character
                         // (continuing in 'doctypeName'), or, on '>' / end of input, emits
                         // an empty-named DOCTYPE token directly and returns to 'data'.
                         // NOTE(review): 'error' is not one of the flags named in the quoted
                         // spec text; confirm how the consumer of these tokens uses it.
1436                     /* Consume the next input character: */
1437                     $char = $this->stream->char();
1438
1439                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1440                         /* U+0009 CHARACTER TABULATION
1441                            U+000A LINE FEED (LF)
1442                            U+000C FORM FEED (FF)
1443                            U+0020 SPACE
1444                         Stay in the before DOCTYPE name state. */
1445
1446                     } elseif($char === '>') {
1447                         /* U+003E GREATER-THAN SIGN (>)
1448                         Parse error. Create a new DOCTYPE token. Set its
1449                         force-quirks flag to on. Emit the token. Switch to the
1450                         data state. */
1451                         $this->emitToken(array(
1452                             'type' => self::PARSEERROR,
1453                             'data' => 'expected-doctype-name-but-got-right-bracket'
1454                         ));
                             // The token is built inline and emitted at once; $this->token
                             // is not assigned in this branch.
1455                         $this->emitToken(array(
1456                             'name' => '',
1457                             'type' => self::DOCTYPE,
1458                             'force-quirks' => true,
1459                             'error' => true
1460                         ));
1461
1462                         $state = 'data';
1463
                         // Range test on the character. When $char is false (end of input)
                         // PHP compares the operands as booleans, 'A' <= false is false, and
                         // control still reaches the EOF branch further down. Assumes char()
                         // returns a single character; confirm in the stream class.
1464                     } elseif('A' <= $char && $char <= 'Z') {
1465                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1466                         Create a new DOCTYPE token. Set the token's name to the
1467                         lowercase version of the input character (add 0x0020 to
1468                         the character's code point). Switch to the DOCTYPE name
1469                         state. */
1470                         $this->token = array(
1471                             'name' => strtolower($char),
1472                             'type' => self::DOCTYPE,
1473                             'error' => true
1474                         );
1475
1476                         $state = 'doctypeName';
1477
1478                     } elseif($char === false) {
1479                         /* EOF
1480                         Parse error. Create a new DOCTYPE token. Set its
1481                         force-quirks flag to on. Emit the token. Reconsume the
1482                         EOF character in the data state. */
1483                         $this->emitToken(array(
1484                             'type' => self::PARSEERROR,
1485                             'data' => 'expected-doctype-name-but-got-eof'
1486                         ));
1487                         $this->emitToken(array(
1488                             'name' => '',
1489                             'type' => self::DOCTYPE,
1490                             'force-quirks' => true,
1491                             'error' => true
1492                         ));
1493
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1494                         $this->stream->unget();
1495                         $state = 'data';
1496
1497                     } else {
1498                         /* Anything else
1499                         Create a new DOCTYPE token. Set the token's name to the
1500                         current input character. Switch to the DOCTYPE name state. */
1501                         $this->token = array(
1502                             'name' => $char,
1503                             'type' => self::DOCTYPE,
1504                             'error' => true
1505                         );
1506
1507                         $state = 'doctypeName';
1508                     }
1509                 break;
1510
1511                 case 'doctypeName':
                         // State 'doctypeName': appends characters to $this->token['name']
                         // (A-Z are lowercased first) until whitespace, '>' or end of input.
                         // After the branch chain, 'error' is recomputed on every pass.
1512                     /* Consume the next input character: */
1513                     $char = $this->stream->char();
1514
1515                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1516                         /* U+0009 CHARACTER TABULATION
1517                            U+000A LINE FEED (LF)
1518                            U+000C FORM FEED (FF)
1519                            U+0020 SPACE
1520                         Switch to the after DOCTYPE name state. */
1521                         $state = 'afterDoctypeName';
1522
1523                     } elseif($char === '>') {
1524                         /* U+003E GREATER-THAN SIGN (>)
1525                         Emit the current DOCTYPE token. Switch to the data state. */
1526                         $this->emitToken($this->token);
1527                         $state = 'data';
1528
                         // Range test on the character. When $char is false (end of input)
                         // PHP compares the operands as booleans, 'A' <= false is false, and
                         // control still reaches the EOF branch below.
1529                     } elseif('A' <= $char && $char <= 'Z') {
1530                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1531                         Append the lowercase version of the input character
1532                         (add 0x0020 to the character's code point) to the current
1533                         DOCTYPE token's name. Stay in the DOCTYPE name state. */
1534                         $this->token['name'] .= strtolower($char);
1535
1536                     } elseif($char === false) {
1537                         /* EOF
1538                         Parse error. Set the DOCTYPE token's force-quirks flag
1539                         to on. Emit that DOCTYPE token. Reconsume the EOF
1540                         character in the data state. */
1541                         $this->emitToken(array(
1542                             'type' => self::PARSEERROR,
1543                             'data' => 'eof-in-doctype-name'
1544                         ));
1545                         $this->token['force-quirks'] = true;
1546                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1547                         $this->stream->unget();
1548                         $state = 'data';
1549
1550                     } else {
1551                         /* Anything else
1552                         Append the current input character to the current
1553                         DOCTYPE token's name. Stay in the DOCTYPE name state. */
1554                         $this->token['name'] .= $char;
1555                     }
1556
1557                     // XXX this is probably some sort of quirks mode designation,
1558                     // check tree-builder to be sure. In general 'error' needs
1559                     // to be specc'ified, this probably means removing it at the end
                         // NOTE(review): this assignment also runs on the passes where the
                         // token was already handed to emitToken() above; whether the emitted
                         // token observes it depends on how emitToken() receives its argument
                         // (not visible here). In addition, A-Z are lowercased before being
                         // stored in 'name', so a name typed in ASCII letters does not appear
                         // able to equal 'HTML', which would leave 'error' always true.
                         // Confirm the intended comparison before relying on this flag.
1560                     $this->token['error'] = ($this->token['name'] === 'HTML')
1561                         ? false
1562                         : true;
1563                 break;
1564
1565                 case 'afterDoctypeName':
                         // State 'afterDoctypeName': after the name and at least one
                         // whitespace character. Skips further whitespace, finishes the
                         // DOCTYPE on '>' or end of input, and otherwise looks for the
                         // PUBLIC / SYSTEM keywords.
1566                     /* Consume the next input character: */
1567                     $char = $this->stream->char();
1568
1569                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1570                         /* U+0009 CHARACTER TABULATION
1571                            U+000A LINE FEED (LF)
1572                            U+000C FORM FEED (FF)
1573                            U+0020 SPACE
1574                         Stay in the after DOCTYPE name state. */
1575
1576                     } elseif($char === '>') {
1577                         /* U+003E GREATER-THAN SIGN (>)
1578                         Emit the current DOCTYPE token. Switch to the data state. */
1579                         $this->emitToken($this->token);
1580                         $state = 'data';
1581
1582                     } elseif($char === false) {
1583                         /* EOF
1584                         Parse error. Set the DOCTYPE token's force-quirks flag
1585                         to on. Emit that DOCTYPE token. Reconsume the EOF
1586                         character in the data state. */
1587                         $this->emitToken(array(
1588                             'type' => self::PARSEERROR,
1589                             'data' => 'eof-in-doctype'
1590                         ));
1591                         $this->token['force-quirks'] = true;
1592                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1593                         $this->stream->unget();
1594                         $state = 'data';
1595
1596                     } else {
1597                         /* Anything else */
1598
                             // The character already read is joined with whatever
                             // charsWhile() returns and upper-cased, which makes the keyword
                             // match case-insensitive. Presumably charsWhile(self::ALPHA, 5)
                             // consumes at most five further characters drawn from the
                             // ALPHA set; confirm in the stream class.
1599                         $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1600                         if ($nextSix === 'PUBLIC') {
1601                             /* If the next six characters are an ASCII
1602                             case-insensitive match for the word "PUBLIC", then
1603                             consume those characters and switch to the before
1604                             DOCTYPE public identifier state. */
1605                             $state = 'beforeDoctypePublicIdentifier';
1606
1607                         } elseif ($nextSix === 'SYSTEM') {
1608                             /* Otherwise, if the next six characters are an ASCII
1609                             case-insensitive match for the word "SYSTEM", then
1610                             consume those characters and switch to the before
1611                             DOCTYPE system identifier state. */
1612                             $state = 'beforeDoctypeSystemIdentifier';
1613
1614                         } else {
1615                             /* Otherwise, this is the parse error. Set the DOCTYPE
1616                             token's force-quirks flag to on. Switch to the bogus
1617                             DOCTYPE state. */
                                 // The characters gathered into $nextSix are not pushed
                                 // back before 'bogusDoctype' takes over.
1618                             $this->emitToken(array(
1619                                 'type' => self::PARSEERROR,
1620                                 'data' => 'expected-space-or-right-bracket-in-doctype'
1621                             ));
1622                             $this->token['force-quirks'] = true;
1623                             $this->token['error'] = true;
1624                             $state = 'bogusDoctype';
1625                         }
1626                     }
1627                 break;
1628
1629                 case 'beforeDoctypePublicIdentifier':
                         // State 'beforeDoctypePublicIdentifier': entered after the PUBLIC
                         // keyword. Skips whitespace and expects an opening quote; the quote
                         // style selects which quoted state collects the identifier into
                         // $this->token['public'].
1630                     /* Consume the next input character: */
1631                     $char = $this->stream->char();
1632
1633                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1634                         /* U+0009 CHARACTER TABULATION
1635                            U+000A LINE FEED (LF)
1636                            U+000C FORM FEED (FF)
1637                            U+0020 SPACE
1638                         Stay in the before DOCTYPE public identifier state. */
1639                     } elseif ($char === '"') {
1640                         /* U+0022 QUOTATION MARK (")
1641                         Set the DOCTYPE token's public identifier to the empty
1642                         string (not missing), then switch to the DOCTYPE public
1643                         identifier (double-quoted) state. */
1644                         $this->token['public'] = '';
1645                         $state = 'doctypePublicIdentifierDoubleQuoted';
1646                     } elseif ($char === "'") {
1647                         /* U+0027 APOSTROPHE (')
1648                         Set the DOCTYPE token's public identifier to the empty
1649                         string (not missing), then switch to the DOCTYPE public
1650                         identifier (single-quoted) state. */
1651                         $this->token['public'] = '';
1652                         $state = 'doctypePublicIdentifierSingleQuoted';
1653                     } elseif ($char === '>') {
                             // The DOCTYPE ended before any identifier was given.
1654                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1655                         to on. Emit that DOCTYPE token. Switch to the data state. */
1656                         $this->emitToken(array(
1657                             'type' => self::PARSEERROR,
1658                             'data' => 'unexpected-end-of-doctype'
1659                         ));
1660                         $this->token['force-quirks'] = true;
1661                         $this->emitToken($this->token);
1662                         $state = 'data';
1663                     } elseif ($char === false) {
                             // End of input (false is the value this code treats as EOF).
1664                         /* Parse error. Set the DOCTYPE token's force-quirks
1665                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1666                         character in the data state. */
1667                         $this->emitToken(array(
1668                             'type' => self::PARSEERROR,
1669                             'data' => 'eof-in-doctype'
1670                         ));
1671                         $this->token['force-quirks'] = true;
1672                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1673                         $this->stream->unget();
1674                         $state = 'data';
1675                     } else {
                             // Any other character: the token is kept (not emitted yet)
                             // and handling continues in 'bogusDoctype'.
1676                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1677                         to on. Switch to the bogus DOCTYPE state. */
1678                         $this->emitToken(array(
1679                             'type' => self::PARSEERROR,
1680                             'data' => 'unexpected-char-in-doctype'
1681                         ));
1682                         $this->token['force-quirks'] = true;
1683                         $state = 'bogusDoctype';
1684                     }
1685                 break;
1686
1687                 case 'doctypePublicIdentifierDoubleQuoted':
                         // State 'doctypePublicIdentifierDoubleQuoted': collects characters
                         // into $this->token['public'] until the closing double quote. A '>'
                         // or end of input before the quote ends the DOCTYPE early with
                         // force-quirks set.
1688                     /* Consume the next input character: */
1689                     $char = $this->stream->char();
1690
1691                     if ($char === '"') {
1692                         /* U+0022 QUOTATION MARK (")
1693                         Switch to the after DOCTYPE public identifier state. */
1694                         $state = 'afterDoctypePublicIdentifier';
1695                     } elseif ($char === '>') {
1696                         /* U+003E GREATER-THAN SIGN (>)
1697                         Parse error. Set the DOCTYPE token's force-quirks flag
1698                         to on. Emit that DOCTYPE token. Switch to the data state. */
1699                         $this->emitToken(array(
1700                             'type' => self::PARSEERROR,
1701                             'data' => 'unexpected-end-of-doctype'
1702                         ));
1703                         $this->token['force-quirks'] = true;
1704                         $this->emitToken($this->token);
1705                         $state = 'data';
1706                     } elseif ($char === false) {
1707                         /* EOF
1708                         Parse error. Set the DOCTYPE token's force-quirks flag
1709                         to on. Emit that DOCTYPE token. Reconsume the EOF
1710                         character in the data state. */
1711                         $this->emitToken(array(
1712                             'type' => self::PARSEERROR,
1713                             'data' => 'eof-in-doctype'
1714                         ));
1715                         $this->token['force-quirks'] = true;
1716                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1717                         $this->stream->unget();
1718                         $state = 'data';
1719                     } else {
1720                         /* Anything else
1721                         Append the current input character to the current
1722                         DOCTYPE token's public identifier. Stay in the DOCTYPE
1723                         public identifier (double-quoted) state. */
                             // An apostrophe is ordinary data here; only '"' closes the value.
1724                         $this->token['public'] .= $char;
1725                     }
1726                 break;
1727
1728                 case 'doctypePublicIdentifierSingleQuoted':
                         // State 'doctypePublicIdentifierSingleQuoted': collects characters
                         // into $this->token['public'] until the closing apostrophe. A '>'
                         // or end of input before the quote ends the DOCTYPE early with
                         // force-quirks set.
1729                     /* Consume the next input character: */
1730                     $char = $this->stream->char();
1731
1732                     if ($char === "'") {
1733                         /* U+0027 APOSTROPHE (')
1734                         Switch to the after DOCTYPE public identifier state. */
1735                         $state = 'afterDoctypePublicIdentifier';
1736                     } elseif ($char === '>') {
1737                         /* U+003E GREATER-THAN SIGN (>)
1738                         Parse error. Set the DOCTYPE token's force-quirks flag
1739                         to on. Emit that DOCTYPE token. Switch to the data state. */
1740                         $this->emitToken(array(
1741                             'type' => self::PARSEERROR,
1742                             'data' => 'unexpected-end-of-doctype'
1743                         ));
1744                         $this->token['force-quirks'] = true;
1745                         $this->emitToken($this->token);
1746                         $state = 'data';
1747                     } elseif ($char === false) {
1748                         /* EOF
1749                         Parse error. Set the DOCTYPE token's force-quirks flag
1750                         to on. Emit that DOCTYPE token. Reconsume the EOF
1751                         character in the data state. */
1752                         $this->emitToken(array(
1753                             'type' => self::PARSEERROR,
1754                             'data' => 'eof-in-doctype'
1755                         ));
1756                         $this->token['force-quirks'] = true;
1757                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1758                         $this->stream->unget();
1759                         $state = 'data';
1760                     } else {
1761                         /* Anything else
1762                         Append the current input character to the current
1763                         DOCTYPE token's public identifier. Stay in the DOCTYPE
1764                         public identifier (single-quoted) state. */
                             // A double quote is ordinary data here; only "'" closes the value.
1765                         $this->token['public'] .= $char;
1766                     }
1767                 break;
1768
1769                 case 'afterDoctypePublicIdentifier':
                         // State 'afterDoctypePublicIdentifier': the public identifier has
                         // been closed. Skips whitespace; a quote opens the optional system
                         // identifier (stored in $this->token['system']); '>' emits the
                         // DOCTYPE as is, without a parse error.
1770                     /* Consume the next input character: */
1771                     $char = $this->stream->char();
1772
1773                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1774                         /* U+0009 CHARACTER TABULATION
1775                            U+000A LINE FEED (LF)
1776                            U+000C FORM FEED (FF)
1777                            U+0020 SPACE
1778                         Stay in the after DOCTYPE public identifier state. */
1779                     } elseif ($char === '"') {
1780                         /* U+0022 QUOTATION MARK (")
1781                         Set the DOCTYPE token's system identifier to the
1782                         empty string (not missing), then switch to the DOCTYPE
1783                         system identifier (double-quoted) state. */
1784                         $this->token['system'] = '';
1785                         $state = 'doctypeSystemIdentifierDoubleQuoted';
1786                     } elseif ($char === "'") {
1787                         /* U+0027 APOSTROPHE (')
1788                         Set the DOCTYPE token's system identifier to the
1789                         empty string (not missing), then switch to the DOCTYPE
1790                         system identifier (single-quoted) state. */
1791                         $this->token['system'] = '';
1792                         $state = 'doctypeSystemIdentifierSingleQuoted';
1793                     } elseif ($char === '>') {
1794                         /* U+003E GREATER-THAN SIGN (>)
1795                         Emit the current DOCTYPE token. Switch to the data state. */
1796                         $this->emitToken($this->token);
1797                         $state = 'data';
1798                     } elseif ($char === false) {
                             // End of input (false is the value this code treats as EOF).
1799                         /* Parse error. Set the DOCTYPE token's force-quirks
1800                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1801                         character in the data state. */
1802                         $this->emitToken(array(
1803                             'type' => self::PARSEERROR,
1804                             'data' => 'eof-in-doctype'
1805                         ));
1806                         $this->token['force-quirks'] = true;
1807                         $this->emitToken($this->token);
                             // unget() carries out the "reconsume" step quoted above
                             // (presumably by stepping the stream back one position;
                             // confirm in the stream class).
1808                         $this->stream->unget();
1809                         $state = 'data';
1810                     } else {
1811                         /* Anything else
1812                         Parse error. Set the DOCTYPE token's force-quirks flag
1813                         to on. Switch to the bogus DOCTYPE state. */
1814                         $this->emitToken(array(
1815                             'type' => self::PARSEERROR,
1816                             'data' => 'unexpected-char-in-doctype'
1817                         ));
1818                         $this->token['force-quirks'] = true;
1819                         $state = 'bogusDoctype';
1820                     }
1821                 break;
1822
1823                 case 'beforeDoctypeSystemIdentifier':
1824                     /* Consume the next input character: */
1825                     $char = $this->stream->char();
1826
1827                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1828                         /* U+0009 CHARACTER TABULATION
1829                            U+000A LINE FEED (LF)
1830                            U+000C FORM FEED (FF)
1831                            U+0020 SPACE
1832                         Stay in the before DOCTYPE system identifier state. */
1833                     } elseif ($char === '"') {
1834                         /* U+0022 QUOTATION MARK (")
1835                         Set the DOCTYPE token's system identifier to the empty
1836                         string (not missing), then switch to the DOCTYPE system
1837                         identifier (double-quoted) state. */
1838                         $this->token['system'] = '';
1839                         $state = 'doctypeSystemIdentifierDoubleQuoted';
1840                     } elseif ($char === "'") {
1841                         /* U+0027 APOSTROPHE (')
1842                         Set the DOCTYPE token's system identifier to the empty
1843                         string (not missing), then switch to the DOCTYPE system
1844                         identifier (single-quoted) state. */
1845                         $this->token['system'] = '';
1846                         $state = 'doctypeSystemIdentifierSingleQuoted';
1847                     } elseif ($char === '>') {
1848                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1849                         to on. Emit that DOCTYPE token. Switch to the data state. */
1850                         $this->emitToken(array(
1851                             'type' => self::PARSEERROR,
1852                             'data' => 'unexpected-char-in-doctype'
1853                         ));
1854                         $this->token['force-quirks'] = true;
1855                         $this->emitToken($this->token);
1856                         $state = 'data';
1857                     } elseif ($char === false) {
1858                         /* Parse error. Set the DOCTYPE token's force-quirks
1859                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1860                         character in the data state. */
1861                         $this->emitToken(array(
1862                             'type' => self::PARSEERROR,
1863                             'data' => 'eof-in-doctype'
1864                         ));
1865                         $this->token['force-quirks'] = true;
1866                         $this->emitToken($this->token);
1867                         $this->stream->unget();
1868                         $state = 'data';
1869                     } else {
1870                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1871                         to on. Switch to the bogus DOCTYPE state. */
1872                         $this->emitToken(array(
1873                             'type' => self::PARSEERROR,
1874                             'data' => 'unexpected-char-in-doctype'
1875                         ));
1876                         $this->token['force-quirks'] = true;
1877                         $state = 'bogusDoctype';
1878                     }
1879                 break;
1880
1881                 case 'doctypeSystemIdentifierDoubleQuoted':
1882                     /* Consume the next input character: */
1883                     $char = $this->stream->char();
1884
1885                     if ($char === '"') {
1886                         /* U+0022 QUOTATION MARK (")
1887                         Switch to the after DOCTYPE system identifier state. */
1888                         $state = 'afterDoctypeSystemIdentifier';
1889                     } elseif ($char === '>') {
1890                         /* U+003E GREATER-THAN SIGN (>)
1891                         Parse error. Set the DOCTYPE token's force-quirks flag
1892                         to on. Emit that DOCTYPE token. Switch to the data state. */
1893                         $this->emitToken(array(
1894                             'type' => self::PARSEERROR,
1895                             'data' => 'unexpected-end-of-doctype'
1896                         ));
1897                         $this->token['force-quirks'] = true;
1898                         $this->emitToken($this->token);
1899                         $state = 'data';
1900                     } elseif ($char === false) {
1901                         /* EOF
1902                         Parse error. Set the DOCTYPE token's force-quirks flag
1903                         to on. Emit that DOCTYPE token. Reconsume the EOF
1904                         character in the data state. */
1905                         $this->emitToken(array(
1906                             'type' => self::PARSEERROR,
1907                             'data' => 'eof-in-doctype'
1908                         ));
1909                         $this->token['force-quirks'] = true;
1910                         $this->emitToken($this->token);
1911                         $this->stream->unget();
1912                         $state = 'data';
1913                     } else {
1914                         /* Anything else
1915                         Append the current input character to the current
1916                         DOCTYPE token's system identifier. Stay in the DOCTYPE
1917                         system identifier (double-quoted) state. */
1918                         $this->token['system'] .= $char;
1919                     }
1920                 break;
1921
1922                 case 'doctypeSystemIdentifierSingleQuoted':
1923                     /* Consume the next input character: */
1924                     $char = $this->stream->char();
1925
1926                     if ($char === "'") {
1927                         /* U+0027 APOSTROPHE (')
1928                         Switch to the after DOCTYPE system identifier state. */
1929                         $state = 'afterDoctypeSystemIdentifier';
1930                     } elseif ($char === '>') {
1931                         /* U+003E GREATER-THAN SIGN (>)
1932                         Parse error. Set the DOCTYPE token's force-quirks flag
1933                         to on. Emit that DOCTYPE token. Switch to the data state. */
1934                         $this->emitToken(array(
1935                             'type' => self::PARSEERROR,
1936                             'data' => 'unexpected-end-of-doctype'
1937                         ));
1938                         $this->token['force-quirks'] = true;
1939                         $this->emitToken($this->token);
1940                         $state = 'data';
1941                     } elseif ($char === false) {
1942                         /* EOF
1943                         Parse error. Set the DOCTYPE token's force-quirks flag
1944                         to on. Emit that DOCTYPE token. Reconsume the EOF
1945                         character in the data state. */
1946                         $this->emitToken(array(
1947                             'type' => self::PARSEERROR,
1948                             'data' => 'eof-in-doctype'
1949                         ));
1950                         $this->token['force-quirks'] = true;
1951                         $this->emitToken($this->token);
1952                         $this->stream->unget();
1953                         $state = 'data';
1954                     } else {
1955                         /* Anything else
1956                         Append the current input character to the current
1957                         DOCTYPE token's system identifier. Stay in the DOCTYPE
                        system identifier (single-quoted) state. */
1959                         $this->token['system'] .= $char;
1960                     }
1961                 break;
1962
1963                 case 'afterDoctypeSystemIdentifier':
1964                     /* Consume the next input character: */
1965                     $char = $this->stream->char();
1966
1967                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1968                         /* U+0009 CHARACTER TABULATION
1969                            U+000A LINE FEED (LF)
1970                            U+000C FORM FEED (FF)
1971                            U+0020 SPACE
1972                         Stay in the after DOCTYPE system identifier state. */
1973                     } elseif ($char === '>') {
1974                         /* U+003E GREATER-THAN SIGN (>)
1975                         Emit the current DOCTYPE token. Switch to the data state. */
1976                         $this->emitToken($this->token);
1977                         $state = 'data';
1978                     } elseif ($char === false) {
1979                         /* Parse error. Set the DOCTYPE token's force-quirks
1980                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1981                         character in the data state. */
1982                         $this->emitToken(array(
1983                             'type' => self::PARSEERROR,
1984                             'data' => 'eof-in-doctype'
1985                         ));
1986                         $this->token['force-quirks'] = true;
1987                         $this->emitToken($this->token);
1988                         $this->stream->unget();
1989                         $state = 'data';
1990                     } else {
1991                         /* Anything else
1992                         Parse error. Switch to the bogus DOCTYPE state.
1993                         (This does not set the DOCTYPE token's force-quirks
1994                         flag to on.) */
1995                         $this->emitToken(array(
1996                             'type' => self::PARSEERROR,
1997                             'data' => 'unexpected-char-in-doctype'
1998                         ));
1999                         $state = 'bogusDoctype';
2000                     }
2001                 break;
2002
2003                 case 'bogusDoctype':
2004                     /* Consume the next input character: */
2005                     $char = $this->stream->char();
2006
2007                     if ($char === '>') {
2008                         /* U+003E GREATER-THAN SIGN (>)
2009                         Emit the DOCTYPE token. Switch to the data state. */
2010                         $this->emitToken($this->token);
2011                         $state = 'data';
2012
2013                     } elseif($char === false) {
2014                         /* EOF
2015                         Emit the DOCTYPE token. Reconsume the EOF character in
2016                         the data state. */
2017                         $this->emitToken($this->token);
2018                         $this->stream->unget();
2019                         $state = 'data';
2020
2021                     } else {
2022                         /* Anything else
2023                         Stay in the bogus DOCTYPE state. */
2024                     }
2025                 break;
2026
2027                 // case 'cdataSection':
2028
2029             }
2030         }
2031     }
2032
2033     /**
2034      * Returns a serialized representation of the tree.
2035      */
2036     public function save() {
2037         return $this->tree->save();
2038     }
2039
2040     /**
2041      * Returns the input stream.
2042      */
2043     public function stream() {
2044         return $this->stream;
2045     }
2046
2047     private function consumeCharacterReference($allowed = false, $inattr = false) {
2048         // This goes quite far against spec, and is far closer to the Python
2049         // impl., mainly because we don't do the large unconsuming the spec
2050         // requires.
2051
2052         // All consumed characters.
2053         $chars = $this->stream->char();
2054
2055         /* This section defines how to consume a character
2056         reference. This definition is used when parsing character
2057         references in text and in attributes.
2058
2059         The behavior depends on the identity of the next character
2060         (the one immediately after the U+0026 AMPERSAND character): */
2061
2062         if (
2063             $chars[0] === "\x09" ||
2064             $chars[0] === "\x0A" ||
2065             $chars[0] === "\x0C" ||
2066             $chars[0] === "\x20" ||
2067             $chars[0] === '<' ||
2068             $chars[0] === '&' ||
2069             $chars === false ||
2070             $chars[0] === $allowed
2071         ) {
2072             /* U+0009 CHARACTER TABULATION
2073                U+000A LINE FEED (LF)
2074                U+000C FORM FEED (FF)
2075                U+0020 SPACE
2076                U+003C LESS-THAN SIGN
2077                U+0026 AMPERSAND
2078                EOF
2079                The additional allowed character, if there is one
2080             Not a character reference. No characters are consumed,
2081             and nothing is returned. (This is not an error, either.) */
2082             // We already consumed, so unconsume.
2083             $this->stream->unget();
2084             return '&';
2085         } elseif ($chars[0] === '#') {
2086             /* Consume the U+0023 NUMBER SIGN. */
2087             // Um, yeah, we already did that.
2088             /* The behavior further depends on the character after
2089             the U+0023 NUMBER SIGN: */
2090             $chars .= $this->stream->char();
2091             if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
2092                 /* U+0078 LATIN SMALL LETTER X
2093                    U+0058 LATIN CAPITAL LETTER X */
2094                 /* Consume the X. */
2095                 // Um, yeah, we already did that.
2096                 /* Follow the steps below, but using the range of
2097                 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2098                 NINE, U+0061 LATIN SMALL LETTER A through to U+0066
2099                 LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
2100                 A, through to U+0046 LATIN CAPITAL LETTER F (in other
2101                 words, 0123456789, ABCDEF, abcdef). */
2102                 $char_class = self::HEX;
2103                 /* When it comes to interpreting the
2104                 number, interpret it as a hexadecimal number. */
2105                 $hex = true;
2106             } else {
2107                 /* Anything else */
2108                 // Unconsume because we shouldn't have consumed this.
2109                 $chars = $chars[0];
2110                 $this->stream->unget();
2111                 /* Follow the steps below, but using the range of
2112                 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2113                 NINE (i.e. just 0123456789). */
2114                 $char_class = self::DIGIT;
2115                 /* When it comes to interpreting the number,
2116                 interpret it as a decimal number. */
2117                 $hex = false;
2118             }
2119
2120             /* Consume as many characters as match the range of characters given above. */
2121             $consumed = $this->stream->charsWhile($char_class);
2122             if ($consumed === '' || $consumed === false) {
2123                 /* If no characters match the range, then don't consume
2124                 any characters (and unconsume the U+0023 NUMBER SIGN
2125                 character and, if appropriate, the X character). This
2126                 is a parse error; nothing is returned. */
2127                 $this->emitToken(array(
2128                     'type' => self::PARSEERROR,
2129                     'data' => 'expected-numeric-entity'
2130                 ));
2131                 return '&' . $chars;
2132             } else {
2133                 /* Otherwise, if the next character is a U+003B SEMICOLON,
2134                 consume that too. If it isn't, there is a parse error. */
2135                 if ($this->stream->char() !== ';') {
2136                     $this->stream->unget();
2137                     $this->emitToken(array(
2138                         'type' => self::PARSEERROR,
2139                         'data' => 'numeric-entity-without-semicolon'
2140                     ));
2141                 }
2142
2143                 /* If one or more characters match the range, then take
2144                 them all and interpret the string of characters as a number
2145                 (either hexadecimal or decimal as appropriate). */
2146                 $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
2147
2148                 /* If that number is one of the numbers in the first column
2149                 of the following table, then this is a parse error. Find the
2150                 row with that number in the first column, and return a
2151                 character token for the Unicode character given in the
2152                 second column of that row. */
2153                 $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
2154                 if ($new_codepoint) {
2155                     $this->emitToken(array(
2156                         'type' => self::PARSEERROR,
2157                         'data' => 'illegal-windows-1252-entity'
2158                     ));
2159                     $codepoint = $new_codepoint;
2160                 } else {
2161                     /* Otherwise, if the number is in the range 0x0000 to 0x0008,
2162                     U+000B,  U+000E to 0x001F,  0x007F  to 0x009F, 0xD800 to 0xDFFF ,
2163                     0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
2164                     0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
2165                     0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF,
2166                     0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE,
2167                     0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
2168                     0x10FFFE, or 0x10FFFF, or is higher than 0x10FFFF, then this
2169                     is a parse error; return a character token for the U+FFFD
2170                     REPLACEMENT CHARACTER character instead. */
2171                     // && has higher precedence than ||
2172                     if (
2173                         $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
2174                         $codepoint === 0x000B ||
2175                         $codepoint >= 0x000E && $codepoint <= 0x001F ||
2176                         $codepoint >= 0x007F && $codepoint <= 0x009F ||
2177                         $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
2178                         $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
2179                         ($codepoint & 0xFFFE) === 0xFFFE ||
2180                         $codepoint > 0x10FFFF
2181                     ) {
2182                         $this->emitToken(array(
2183                             'type' => self::PARSEERROR,
2184                             'data' => 'illegal-codepoint-for-numeric-entity'
2185                         ));
2186                         $codepoint = 0xFFFD;
2187                     }
2188                 }
2189
2190                 /* Otherwise, return a character token for the Unicode
2191                 character whose code point is that number. */
2192                 return HTML5_Data::utf8chr($codepoint);
2193             }
2194
2195         } else {
2196             /* Anything else */
2197
2198             /* Consume the maximum number of characters possible,
2199             with the consumed characters matching one of the
2200             identifiers in the first column of the named character
2201             references table (in a case-sensitive manner). */
2202
2203             // we will implement this by matching the longest
2204             // alphanumeric + semicolon string, and then working
2205             // our way backwards
2206             $chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1);
2207             $len = strlen($chars);
2208
2209             $refs = HTML5_Data::getNamedCharacterReferences();
2210             $codepoint = false;
2211             for($c = $len; $c > 0; $c--) {
2212                 $id = substr($chars, 0, $c);
2213                 if(isset($refs[$id])) {
2214                     $codepoint = $refs[$id];
2215                     break;
2216                 }
2217             }
2218
2219             /* If no match can be made, then this is a parse error.
2220             No characters are consumed, and nothing is returned. */
2221             if (!$codepoint) {
2222                 $this->emitToken(array(
2223                     'type' => self::PARSEERROR,
2224                     'data' => 'expected-named-entity'
2225                 ));
2226                 return '&' . $chars;
2227             }
2228
2229             /* If the last character matched is not a U+003B SEMICOLON
2230             (;), there is a parse error. */
2231             $semicolon = true;
2232             if (substr($id, -1) !== ';') {
2233                 $this->emitToken(array(
2234                     'type' => self::PARSEERROR,
2235                     'data' => 'named-entity-without-semicolon'
2236                 ));
2237                 $semicolon = false;
2238             }
2239
2240
2241             /* If the character reference is being consumed as part of
2242             an attribute, and the last character matched is not a
2243             U+003B SEMICOLON (;), and the next character is in the
2244             range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
2245             LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
2246             or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
2247             then, for historical reasons, all the characters that were
2248             matched after the U+0026 AMPERSAND (&) must be unconsumed,
2249             and nothing is returned. */
2250             if (
2251                 $inattr && !$semicolon &&
2252                 strspn(substr($chars, $c, 1), self::ALPHA . self::DIGIT)
2253             ) {
2254                 return '&' . $chars;
2255             }
2256
2257             /* Otherwise, return a character token for the character
2258             corresponding to the character reference name (as given
2259             by the second column of the named character references table). */
2260             return HTML5_Data::utf8chr($codepoint) . substr($chars, $c);
2261         }
2262     }
2263
2264     private function characterReferenceInAttributeValue($allowed = false) {
2265         /* Attempt to consume a character reference. */
2266         $entity = $this->consumeCharacterReference($allowed, true);
2267
2268         /* If nothing is returned, append a U+0026 AMPERSAND
2269         character to the current attribute's value.
2270
2271         Otherwise, append the returned character token to the
2272         current attribute's value. */
2273         $char = (!$entity)
2274             ? '&'
2275             : $entity;
2276
2277         $last = count($this->token['attr']) - 1;
2278         $this->token['attr'][$last]['value'] .= $char;
2279
2280         /* Finally, switch back to the attribute value state that you
2281         were in when were switched into this state. */
2282     }
2283
2284     /**
2285      * Emits a token, passing it on to the tree builder.
2286      */
2287     protected function emitToken($token, $checkStream = true) {
2288         if ($checkStream) {
2289             // Emit errors from input stream.
2290             while ($this->stream->errors) {
2291                 $this->emitToken(array_shift($this->stream->errors), false);
2292             }
2293         }
2294
2295         // the current structure of attributes is not a terribly good one
2296         $this->tree->emitToken($token);
2297
2298         if(is_int($this->tree->content_model)) {
2299             $this->content_model = $this->tree->content_model;
2300             $this->tree->content_model = null;
2301
2302         } elseif($token['type'] === self::ENDTAG) {
2303             $this->content_model = self::PCDATA;
2304         }
2305     }
2306 }
2307