]> git.mxchange.org Git - friendica.git/blob - library/HTML5/Tokenizer.php
more friend suggestions
[friendica.git] / library / HTML5 / Tokenizer.php
1 <?php
2
3 /*
4
5 Copyright 2007 Jeroen van der Meer <http://jero.net/>
6 Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
7 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
8
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files (the
11 "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
16
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
28 */
29
30 // Some conventions:
31 // /* */ indicates verbatim text from the HTML 5 specification
32 // // indicates regular comments
33
34 // all flags are in hyphenated form
35
36 class HTML5_Tokenizer {
37     /**
38      * Points to an InputStream object.
39      */
40     protected $stream;
41
42     /**
43      * Tree builder that the tokenizer emits tokens to.
44      */
45     private $tree;
46
47     /**
48      * Current content model we are parsing as.
49      */
50     protected $content_model;
51
52     /**
53      * Current token that is being built, but not yet emitted. Also
54      * is the last token emitted, if applicable.
55      */
56     protected $token;
57
58     // These are constants describing the content model
59     const PCDATA    = 0;
60     const RCDATA    = 1;
61     const CDATA     = 2;
62     const PLAINTEXT = 3;
63
64     // These are constants describing tokens
65     // XXX should probably be moved somewhere else, probably the
66     // HTML5 class.
67     const DOCTYPE        = 0;
68     const STARTTAG       = 1;
69     const ENDTAG         = 2;
70     const COMMENT        = 3;
71     const CHARACTER      = 4;
72     const SPACECHARACTER = 5;
73     const EOF            = 6;
74     const PARSEERROR     = 7;
75
76     // These are constants representing bunches of characters.
77     const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
78     const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
79     const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
80     const DIGIT       = '0123456789';
81     const HEX         = '0123456789ABCDEFabcdef';
82     const WHITESPACE  = "\t\n\x0c ";
83
84     /**
85      * @param $data Data to parse
86      */
87     public function __construct($data, $builder = null) {
88         $this->stream = new HTML5_InputStream($data);
89         if (!$builder) $this->tree = new HTML5_TreeBuilder;
90         $this->content_model = self::PCDATA;
91     }
92
93     public function parseFragment($context = null) {
94         $this->tree->setupContext($context);
95         if ($this->tree->content_model) {
96             $this->content_model = $this->tree->content_model;
97             $this->tree->content_model = null;
98         }
99         $this->parse();
100     }
101
102     // XXX maybe convert this into an iterator? regardless, this function
103     // and the save function should go into a Parser facade of some sort
104     /**
105      * Performs the actual parsing of the document.
106      */
107     public function parse() {
108         // Current state
109         $state = 'data';
110         // This is used to avoid having to have look-behind in the data state.
111         $lastFourChars = '';
112         /**
113          * Escape flag as specified by the HTML5 specification: "used to
114          * control the behavior of the tokeniser. It is either true or
115          * false, and initially must be set to the false state."
116          */
117         $escape = false;
118         //echo "\n\n";
119         while($state !== null) {
120             
121             /*echo $state . ' ';
122             switch ($this->content_model) {
123                 case self::PCDATA: echo 'PCDATA'; break;
124                 case self::RCDATA: echo 'RCDATA'; break;
125                 case self::CDATA: echo 'CDATA'; break;
126                 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
127             }
128             if ($escape) echo " escape";
129             echo "\n";*/
130             
131             switch($state) {
132                 case 'data':
133
134                     /* Consume the next input character */
135                     $char = $this->stream->char();
136                     $lastFourChars .= $char;
137                     if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
138
139                     // see below for meaning
140                     $hyp_cond = 
141                         !$escape &&
142                         (
143                             $this->content_model === self::RCDATA ||
144                             $this->content_model === self::CDATA
145                         );
146                     $amp_cond =
147                         !$escape &&
148                         (
149                             $this->content_model === self::PCDATA ||
150                             $this->content_model === self::RCDATA
151                         );
152                     $lt_cond =
153                         $this->content_model === self::PCDATA ||
154                         (
155                             (
156                                 $this->content_model === self::RCDATA ||
157                                 $this->content_model === self::CDATA
158                              ) &&
159                              !$escape
160                         );
161                     $gt_cond = 
162                         $escape &&
163                         (
164                             $this->content_model === self::RCDATA ||
165                             $this->content_model === self::CDATA
166                         );
167
168                     if($char === '&' && $amp_cond) {
169                         /* U+0026 AMPERSAND (&)
170                         When the content model flag is set to one of the PCDATA or RCDATA
171                         states and the escape flag is false: switch to the
172                         character reference data state. Otherwise: treat it as per
173                         the "anything else" entry below. */
174                         $state = 'characterReferenceData';
175
176                     } elseif(
177                         $char === '-' &&
178                         $hyp_cond &&
179                         $lastFourChars === '<!--'
180                     ) {
181                         /*
182                         U+002D HYPHEN-MINUS (-)
183                         If the content model flag is set to either the RCDATA state or
184                         the CDATA state, and the escape flag is false, and there are at
185                         least three characters before this one in the input stream, and the
186                         last four characters in the input stream, including this one, are
187                         U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
188                         and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
189                         $escape = true;
190
191                         /* In any case, emit the input character as a character token. Stay
192                         in the data state. */
193                         $this->emitToken(array(
194                             'type' => self::CHARACTER,
195                             'data' => '-'
196                         ));
197                         // We do the "any case" part as part of "anything else".
198
199                     /* U+003C LESS-THAN SIGN (<) */
200                     } elseif($char === '<' && $lt_cond) {
201                         /* When the content model flag is set to the PCDATA state: switch
202                         to the tag open state.
203
204                         When the content model flag is set to either the RCDATA state or
205                         the CDATA state and the escape flag is false: switch to the tag
206                         open state.
207
208                         Otherwise: treat it as per the "anything else" entry below. */
209                         $state = 'tagOpen';
210
211                     /* U+003E GREATER-THAN SIGN (>) */
212                     } elseif(
213                         $char === '>' &&
214                         $gt_cond &&
215                         substr($lastFourChars, 1) === '-->'
216                     ) {
217                         /* If the content model flag is set to either the RCDATA state or
218                         the CDATA state, and the escape flag is true, and the last three
219                         characters in the input stream including this one are U+002D
220                         HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
221                         set the escape flag to false. */
222                         $escape = false;
223
224                         /* In any case, emit the input character as a character token.
225                         Stay in the data state. */
226                         $this->emitToken(array(
227                             'type' => self::CHARACTER,
228                             'data' => '>'
229                         ));
230                         // We do the "any case" part as part of "anything else".
231
232                     } elseif($char === false) {
233                         /* EOF
234                         Emit an end-of-file token. */
235                         $state = null;
236                         $this->tree->emitToken(array(
237                             'type' => self::EOF
238                         ));
239                     
240                     } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
241                         // Directly after emitting a token you switch back to the "data
242                         // state". At that point spaceCharacters are important so they are
243                         // emitted separately.
244                         $chars = $this->stream->charsWhile(self::WHITESPACE);
245                         $this->emitToken(array(
246                             'type' => self::SPACECHARACTER,
247                             'data' => $char . $chars
248                         ));
249                         $lastFourChars .= $chars;
250                         if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
251
252                     } else {
253                         /* Anything else
254                         THIS IS AN OPTIMIZATION: Get as many character that
255                         otherwise would also be treated as a character token and emit it
256                         as a single character token. Stay in the data state. */
257                         
258                         $mask = '';
259                         if ($hyp_cond) $mask .= '-';
260                         if ($amp_cond) $mask .= '&';
261                         if ($lt_cond)  $mask .= '<';
262                         if ($gt_cond)  $mask .= '>';
263
264                         if ($mask === '') {
265                             $chars = $this->stream->remainingChars();
266                         } else {
267                             $chars = $this->stream->charsUntil($mask);
268                         }
269
270                         $this->emitToken(array(
271                             'type' => self::CHARACTER,
272                             'data' => $char . $chars
273                         ));
274
275                         $lastFourChars .= $chars;
276                         if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
277
278                         $state = 'data';
279                     }
280                 break;
281
282                 case 'characterReferenceData':
283                     /* (This cannot happen if the content model flag
284                     is set to the CDATA state.) */
285
286                     /* Attempt to consume a character reference, with no
287                     additional allowed character. */
288                     $entity = $this->consumeCharacterReference();
289
290                     /* If nothing is returned, emit a U+0026 AMPERSAND
291                     character token. Otherwise, emit the character token that
292                     was returned. */
293                     // This is all done when consuming the character reference.
294                     $this->emitToken(array(
295                         'type' => self::CHARACTER,
296                         'data' => $entity
297                     ));
298
299                     /* Finally, switch to the data state. */
300                     $state = 'data';
301                 break;
302
303                 case 'tagOpen':
304                     $char = $this->stream->char();
305
306                     switch($this->content_model) {
307                         case self::RCDATA:
308                         case self::CDATA:
309                             /* Consume the next input character. If it is a
310                             U+002F SOLIDUS (/) character, switch to the close
311                             tag open state. Otherwise, emit a U+003C LESS-THAN
312                             SIGN character token and reconsume the current input
313                             character in the data state. */
314                             // We consumed above.
315
316                             if($char === '/') {
317                                 $state = 'closeTagOpen';
318
319                             } else {
320                                 $this->emitToken(array(
321                                     'type' => self::CHARACTER,
322                                     'data' => '<'
323                                 ));
324
325                                 $this->stream->unget();
326
327                                 $state = 'data';
328                             }
329                         break;
330
331                         case self::PCDATA:
332                             /* If the content model flag is set to the PCDATA state
333                             Consume the next input character: */
334                             // We consumed above.
335
336                             if($char === '!') {
337                                 /* U+0021 EXCLAMATION MARK (!)
338                                 Switch to the markup declaration open state. */
339                                 $state = 'markupDeclarationOpen';
340
341                             } elseif($char === '/') {
342                                 /* U+002F SOLIDUS (/)
343                                 Switch to the close tag open state. */
344                                 $state = 'closeTagOpen';
345
346                             } elseif('A' <= $char && $char <= 'Z') {
347                                 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
348                                 Create a new start tag token, set its tag name to the lowercase
349                                 version of the input character (add 0x0020 to the character's code
350                                 point), then switch to the tag name state. (Don't emit the token
351                                 yet; further details will be filled in before it is emitted.) */
352                                 $this->token = array(
353                                     'name'  => strtolower($char),
354                                     'type'  => self::STARTTAG,
355                                     'attr'  => array()
356                                 );
357
358                                 $state = 'tagName';
359
360                             } elseif('a' <= $char && $char <= 'z') {
361                                 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
362                                 Create a new start tag token, set its tag name to the input
363                                 character, then switch to the tag name state. (Don't emit
364                                 the token yet; further details will be filled in before it
365                                 is emitted.) */
366                                 $this->token = array(
367                                     'name'  => $char,
368                                     'type'  => self::STARTTAG,
369                                     'attr'  => array()
370                                 );
371
372                                 $state = 'tagName';
373
374                             } elseif($char === '>') {
375                                 /* U+003E GREATER-THAN SIGN (>)
376                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
377                                 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
378                                 $this->emitToken(array(
379                                     'type' => self::PARSEERROR,
380                                     'data' => 'expected-tag-name-but-got-right-bracket'
381                                 ));
382                                 $this->emitToken(array(
383                                     'type' => self::CHARACTER,
384                                     'data' => '<>'
385                                 ));
386
387                                 $state = 'data';
388
389                             } elseif($char === '?') {
390                                 /* U+003F QUESTION MARK (?)
391                                 Parse error. Switch to the bogus comment state. */
392                                 $this->emitToken(array(
393                                     'type' => self::PARSEERROR,
394                                     'data' => 'expected-tag-name-but-got-question-mark'
395                                 ));
396                                 $this->token = array(
397                                     'data' => '?',
398                                     'type' => self::COMMENT
399                                 );
400                                 $state = 'bogusComment';
401
402                             } else {
403                                 /* Anything else
404                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and
405                                 reconsume the current input character in the data state. */
406                                 $this->emitToken(array(
407                                     'type' => self::PARSEERROR,
408                                     'data' => 'expected-tag-name'
409                                 ));
410                                 $this->emitToken(array(
411                                     'type' => self::CHARACTER,
412                                     'data' => '<'
413                                 ));
414
415                                 $state = 'data';
416                                 $this->stream->unget();
417                             }
418                         break;
419                     }
420                 break;
421
422                 case 'closeTagOpen':
423                     if (
424                         $this->content_model === self::RCDATA ||
425                         $this->content_model === self::CDATA
426                     ) {
427                         /* If the content model flag is set to the RCDATA or CDATA
428                         states... */
429                         $name = strtolower($this->stream->charsWhile(self::ALPHA));
430                         $following = $this->stream->char();
431                         $this->stream->unget();
432                         if (
433                             !$this->token ||
434                             $this->token['name'] !== $name ||
435                             $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
436                         ) {
437                             /* if no start tag token has ever been emitted by this instance
438                             of the tokenizer (fragment case), or, if the next few
439                             characters do not match the tag name of the last start tag
440                             token emitted (compared in an ASCII case-insensitive manner),
441                             or if they do but they are not immediately followed by one of
442                             the following characters:
443
444                                 * U+0009 CHARACTER TABULATION
445                                 * U+000A LINE FEED (LF)
446                                 * U+000C FORM FEED (FF)
447                                 * U+0020 SPACE
448                                 * U+003E GREATER-THAN SIGN (>)
449                                 * U+002F SOLIDUS (/)
450                                 * EOF
451
452                             ...then emit a U+003C LESS-THAN SIGN character token, a
453                             U+002F SOLIDUS character token, and switch to the data
454                             state to process the next input character. */
455                             // XXX: Probably ought to replace in_array with $following === x ||...
456
457                             // We also need to emit $name now we've consumed that, as we
458                             // know it'll just be emitted as a character token.
459                             $this->emitToken(array(
460                                 'type' => self::CHARACTER,
461                                 'data' => '</' . $name
462                             ));
463
464                             $state = 'data';
465                         } else {
466                             // This matches what would happen if we actually did the
467                             // otherwise below (but we can't because we've consumed too
468                             // much).
469
470                             // Start the end tag token with the name we already have.
471                             $this->token = array(
472                                 'name'  => $name,
473                                 'type'  => self::ENDTAG
474                             );
475
476                             // Change to tag name state.
477                             $state = 'tagName';
478                         }
479                     } elseif ($this->content_model === self::PCDATA) {
480                         /* Otherwise, if the content model flag is set to the PCDATA
481                         state [...]: */
482                         $char = $this->stream->char();
483
484                         if ('A' <= $char && $char <= 'Z') {
485                             /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
486                             Create a new end tag token, set its tag name to the lowercase version
487                             of the input character (add 0x0020 to the character's code point), then
488                             switch to the tag name state. (Don't emit the token yet; further details
489                             will be filled in before it is emitted.) */
490                             $this->token = array(
491                                 'name'  => strtolower($char),
492                                 'type'  => self::ENDTAG
493                             );
494
495                             $state = 'tagName';
496
497                         } elseif ('a' <= $char && $char <= 'z') {
498                             /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
499                             Create a new end tag token, set its tag name to the
500                             input character, then switch to the tag name state.
501                             (Don't emit the token yet; further details will be
502                             filled in before it is emitted.) */
503                             $this->token = array(
504                                 'name'  => $char,
505                                 'type'  => self::ENDTAG
506                             );
507
508                             $state = 'tagName';
509
510                         } elseif($char === '>') {
511                             /* U+003E GREATER-THAN SIGN (>)
512                             Parse error. Switch to the data state. */
513                             $this->emitToken(array(
514                                 'type' => self::PARSEERROR,
515                                 'data' => 'expected-closing-tag-but-got-right-bracket'
516                             ));
517                             $state = 'data';
518
519                         } elseif($char === false) {
520                             /* EOF
521                             Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
522                             SOLIDUS character token. Reconsume the EOF character in the data state. */
523                             $this->emitToken(array(
524                                 'type' => self::PARSEERROR,
525                                 'data' => 'expected-closing-tag-but-got-eof'
526                             ));
527                             $this->emitToken(array(
528                                 'type' => self::CHARACTER,
529                                 'data' => '</'
530                             ));
531
532                             $this->stream->unget();
533                             $state = 'data';
534
535                         } else {
536                             /* Parse error. Switch to the bogus comment state. */
537                             $this->emitToken(array(
538                                 'type' => self::PARSEERROR,
539                                 'data' => 'expected-closing-tag-but-got-char'
540                             ));
541                             $this->token = array(
542                                 'data' => $char,
543                                 'type' => self::COMMENT
544                             );
545                             $state = 'bogusComment';
546                         }
547                     }
548                 break;
549
550                 case 'tagName':
551                     /* Consume the next input character: */
552                     $char = $this->stream->char();
553
554                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
555                         /* U+0009 CHARACTER TABULATION
556                         U+000A LINE FEED (LF)
557                         U+000C FORM FEED (FF)
558                         U+0020 SPACE
559                         Switch to the before attribute name state. */
560                         $state = 'beforeAttributeName';
561
562                     } elseif($char === '/') {
563                         /* U+002F SOLIDUS (/)
564                         Switch to the self-closing start tag state. */
565                         $state = 'selfClosingStartTag';
566
567                     } elseif($char === '>') {
568                         /* U+003E GREATER-THAN SIGN (>)
569                         Emit the current tag token. Switch to the data state. */
570                         $this->emitToken($this->token);
571                         $state = 'data';
572
573                     } elseif('A' <= $char && $char <= 'Z') {
574                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
575                         Append the lowercase version of the current input
576                         character (add 0x0020 to the character's code point) to
577                         the current tag token's tag name. Stay in the tag name state. */
578                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
579
580                         $this->token['name'] .= strtolower($char . $chars);
581                         $state = 'tagName';
582
583                     } elseif($char === false) {
584                         /* EOF
585                         Parse error. Emit the current tag token. Reconsume the EOF
586                         character in the data state. */
587                         $this->emitToken(array(
588                             'type' => self::PARSEERROR,
589                             'data' => 'eof-in-tag-name'
590                         ));
591                         $this->emitToken($this->token);
592
593                         $this->stream->unget();
594                         $state = 'data';
595
596                     } else {
597                         /* Anything else
598                         Append the current input character to the current tag token's tag name.
599                         Stay in the tag name state. */
600                         $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
601
602                         $this->token['name'] .= $char . $chars;
603                         $state = 'tagName';
604                     }
605                 break;
606
607                 case 'beforeAttributeName':
608                     /* Consume the next input character: */
609                     $char = $this->stream->char();
610
611                     // this conditional is optimized, check bottom
612                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
613                         /* U+0009 CHARACTER TABULATION
614                         U+000A LINE FEED (LF)
615                         U+000C FORM FEED (FF)
616                         U+0020 SPACE
617                         Stay in the before attribute name state. */
618                         $state = 'beforeAttributeName';
619
620                     } elseif($char === '/') {
621                         /* U+002F SOLIDUS (/)
622                         Switch to the self-closing start tag state. */
623                         $state = 'selfClosingStartTag';
624
625                     } elseif($char === '>') {
626                         /* U+003E GREATER-THAN SIGN (>)
627                         Emit the current tag token. Switch to the data state. */
628                         $this->emitToken($this->token);
629                         $state = 'data';
630
631                     } elseif('A' <= $char && $char <= 'Z') {
632                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
633                         Start a new attribute in the current tag token. Set that
634                         attribute's name to the lowercase version of the current
635                         input character (add 0x0020 to the character's code
636                         point), and its value to the empty string. Switch to the
637                         attribute name state.*/
638                         $this->token['attr'][] = array(
639                             'name'  => strtolower($char),
640                             'value' => ''
641                         );
642
643                         $state = 'attributeName';
644
645                     } elseif($char === false) {
646                         /* EOF
647                         Parse error. Emit the current tag token. Reconsume the EOF
648                         character in the data state. */
649                         $this->emitToken(array(
650                             'type' => self::PARSEERROR,
651                             'data' => 'expected-attribute-name-but-got-eof'
652                         ));
653                         $this->emitToken($this->token);
654
655                         $this->stream->unget();
656                         $state = 'data';
657
658                     } else {
659                         /* U+0022 QUOTATION MARK (")
660                            U+0027 APOSTROPHE (')
661                            U+003D EQUALS SIGN (=)
662                         Parse error. Treat it as per the "anything else" entry
663                         below. */
664                         if($char === '"' || $char === "'" || $char === '=') {
665                             $this->emitToken(array(
666                                 'type' => self::PARSEERROR,
667                                 'data' => 'invalid-character-in-attribute-name'
668                             ));
669                         }
670
671                         /* Anything else
672                         Start a new attribute in the current tag token. Set that attribute's
673                         name to the current input character, and its value to the empty string.
674                         Switch to the attribute name state. */
675                         $this->token['attr'][] = array(
676                             'name'  => $char,
677                             'value' => ''
678                         );
679
680                         $state = 'attributeName';
681                     }
682                 break;
683
684                 case 'attributeName':
685                     // Consume the next input character:
686                     $char = $this->stream->char();
687
688                     // this conditional is optimized, check bottom
689                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
690                         /* U+0009 CHARACTER TABULATION
691                         U+000A LINE FEED (LF)
692                         U+000C FORM FEED (FF)
693                         U+0020 SPACE
694                         Switch to the after attribute name state. */
695                         $state = 'afterAttributeName';
696
697                     } elseif($char === '/') {
698                         /* U+002F SOLIDUS (/)
699                         Switch to the self-closing start tag state. */
700                         $state = 'selfClosingStartTag';
701
702                     } elseif($char === '=') {
703                         /* U+003D EQUALS SIGN (=)
704                         Switch to the before attribute value state. */
705                         $state = 'beforeAttributeValue';
706
707                     } elseif($char === '>') {
708                         /* U+003E GREATER-THAN SIGN (>)
709                         Emit the current tag token. Switch to the data state. */
710                         $this->emitToken($this->token);
711                         $state = 'data';
712
713                     } elseif('A' <= $char && $char <= 'Z') {
714                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
715                         Append the lowercase version of the current input
716                         character (add 0x0020 to the character's code point) to
717                         the current attribute's name. Stay in the attribute name
718                         state. */
719                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
720
721                         $last = count($this->token['attr']) - 1;
722                         $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
723
724                         $state = 'attributeName';
725
726                     } elseif($char === false) {
727                         /* EOF
728                         Parse error. Emit the current tag token. Reconsume the EOF
729                         character in the data state. */
730                         $this->emitToken(array(
731                             'type' => self::PARSEERROR,
732                             'data' => 'eof-in-attribute-name'
733                         ));
734                         $this->emitToken($this->token);
735
736                         $this->stream->unget();
737                         $state = 'data';
738
739                     } else {
740                         /* U+0022 QUOTATION MARK (")
741                            U+0027 APOSTROPHE (')
742                         Parse error. Treat it as per the "anything else"
743                         entry below. */
744                         if($char === '"' || $char === "'") {
745                             $this->emitToken(array(
746                                 'type' => self::PARSEERROR,
747                                 'data' => 'invalid-character-in-attribute-name'
748                             ));
749                         }
750
751                         /* Anything else
752                         Append the current input character to the current attribute's name.
753                         Stay in the attribute name state. */
754                         $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
755
756                         $last = count($this->token['attr']) - 1;
757                         $this->token['attr'][$last]['name'] .= $char . $chars;
758
759                         $state = 'attributeName';
760                     }
761
762                     /* When the user agent leaves the attribute name state
763                     (and before emitting the tag token, if appropriate), the
764                     complete attribute's name must be compared to the other
765                     attributes on the same token; if there is already an
766                     attribute on the token with the exact same name, then this
767                     is a parse error and the new attribute must be dropped, along
768                     with the value that gets associated with it (if any). */
769                     // this might be implemented in the emitToken method
770                 break;
771
772                 case 'afterAttributeName':
773                     // Consume the next input character:
774                     $char = $this->stream->char();
775
776                     // this is an optimized conditional, check the bottom
777                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
778                         /* U+0009 CHARACTER TABULATION
779                         U+000A LINE FEED (LF)
780                         U+000C FORM FEED (FF)
781                         U+0020 SPACE
782                         Stay in the after attribute name state. */
783                         $state = 'afterAttributeName';
784
785                     } elseif($char === '/') {
786                         /* U+002F SOLIDUS (/)
787                         Switch to the self-closing start tag state. */
788                         $state = 'selfClosingStartTag';
789
790                     } elseif($char === '=') {
791                         /* U+003D EQUALS SIGN (=)
792                         Switch to the before attribute value state. */
793                         $state = 'beforeAttributeValue';
794
795                     } elseif($char === '>') {
796                         /* U+003E GREATER-THAN SIGN (>)
797                         Emit the current tag token. Switch to the data state. */
798                         $this->emitToken($this->token);
799                         $state = 'data';
800
801                     } elseif('A' <= $char && $char <= 'Z') {
802                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
803                         Start a new attribute in the current tag token. Set that
804                         attribute's name to the lowercase version of the current
805                         input character (add 0x0020 to the character's code
806                         point), and its value to the empty string. Switch to the
807                         attribute name state. */
808                         $this->token['attr'][] = array(
809                             'name'  => strtolower($char),
810                             'value' => ''
811                         );
812
813                         $state = 'attributeName';
814
815                     } elseif($char === false) {
816                         /* EOF
817                         Parse error. Emit the current tag token. Reconsume the EOF
818                         character in the data state. */
819                         $this->emitToken(array(
820                             'type' => self::PARSEERROR,
821                             'data' => 'expected-end-of-tag-but-got-eof'
822                         ));
823                         $this->emitToken($this->token);
824
825                         $this->stream->unget();
826                         $state = 'data';
827
828                     } else {
829                         /* U+0022 QUOTATION MARK (")
830                            U+0027 APOSTROPHE (')
831                         Parse error. Treat it as per the "anything else"
832                         entry below. */
833                         if($char === '"' || $char === "'") {
834                             $this->emitToken(array(
835                                 'type' => self::PARSEERROR,
836                                 'data' => 'invalid-character-after-attribute-name'
837                             ));
838                         }
839
840                         /* Anything else
841                         Start a new attribute in the current tag token. Set that attribute's
842                         name to the current input character, and its value to the empty string.
843                         Switch to the attribute name state. */
844                         $this->token['attr'][] = array(
845                             'name'  => $char,
846                             'value' => ''
847                         );
848
849                         $state = 'attributeName';
850                     }
851                 break;
852
853                 case 'beforeAttributeValue':
854                     // Consume the next input character:
855                     $char = $this->stream->char();
856
857                     // this is an optimized conditional
858                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
859                         /* U+0009 CHARACTER TABULATION
860                         U+000A LINE FEED (LF)
861                         U+000C FORM FEED (FF)
862                         U+0020 SPACE
863                         Stay in the before attribute value state. */
864                         $state = 'beforeAttributeValue';
865
866                     } elseif($char === '"') {
867                         /* U+0022 QUOTATION MARK (")
868                         Switch to the attribute value (double-quoted) state. */
869                         $state = 'attributeValueDoubleQuoted';
870
871                     } elseif($char === '&') {
872                         /* U+0026 AMPERSAND (&)
873                         Switch to the attribute value (unquoted) state and reconsume
874                         this input character. */
875                         $this->stream->unget();
876                         $state = 'attributeValueUnquoted';
877
878                     } elseif($char === '\'') {
879                         /* U+0027 APOSTROPHE (')
880                         Switch to the attribute value (single-quoted) state. */
881                         $state = 'attributeValueSingleQuoted';
882
883                     } elseif($char === '>') {
884                         /* U+003E GREATER-THAN SIGN (>)
885                         Parse error. Emit the current tag token. Switch to the data state. */
886                         $this->emitToken(array(
887                             'type' => self::PARSEERROR,
888                             'data' => 'expected-attribute-value-but-got-right-bracket'
889                         ));
890                         $this->emitToken($this->token);
891                         $state = 'data';
892
893                     } elseif($char === false) {
894                         /* EOF
895                         Parse error. Emit the current tag token. Reconsume
896                         the character in the data state. */
897                         $this->emitToken(array(
898                             'type' => self::PARSEERROR,
899                             'data' => 'expected-attribute-value-but-got-eof'
900                         ));
901                         $this->emitToken($this->token);
902                         $this->stream->unget();
903                         $state = 'data';
904
905                     } else {
906                         /* U+003D EQUALS SIGN (=)
907                         Parse error. Treat it as per the "anything else" entry below. */
908                         if($char === '=') {
909                             $this->emitToken(array(
910                                 'type' => self::PARSEERROR,
911                                 'data' => 'equals-in-unquoted-attribute-value'
912                             ));
913                         }
914
915                         /* Anything else
916                         Append the current input character to the current attribute's value.
917                         Switch to the attribute value (unquoted) state. */
918                         $last = count($this->token['attr']) - 1;
919                         $this->token['attr'][$last]['value'] .= $char;
920
921                         $state = 'attributeValueUnquoted';
922                     }
923                 break;
924
925                 case 'attributeValueDoubleQuoted':
926                     // Consume the next input character:
927                     $char = $this->stream->char();
928
929                     if($char === '"') {
930                         /* U+0022 QUOTATION MARK (")
931                         Switch to the after attribute value (quoted) state. */
932                         $state = 'afterAttributeValueQuoted';
933
934                     } elseif($char === '&') {
935                         /* U+0026 AMPERSAND (&)
936                         Switch to the character reference in attribute value
937                         state, with the additional allowed character
938                         being U+0022 QUOTATION MARK ("). */
939                         $this->characterReferenceInAttributeValue('"');
940
941                     } elseif($char === false) {
942                         /* EOF
943                         Parse error. Emit the current tag token. Reconsume the character
944                         in the data state. */
945                         $this->emitToken(array(
946                             'type' => self::PARSEERROR,
947                             'data' => 'eof-in-attribute-value-double-quote'
948                         ));
949                         $this->emitToken($this->token);
950
951                         $this->stream->unget();
952                         $state = 'data';
953
954                     } else {
955                         /* Anything else
956                         Append the current input character to the current attribute's value.
957                         Stay in the attribute value (double-quoted) state. */
958                         $chars = $this->stream->charsUntil('"&');
959
960                         $last = count($this->token['attr']) - 1;
961                         $this->token['attr'][$last]['value'] .= $char . $chars;
962
963                         $state = 'attributeValueDoubleQuoted';
964                     }
965                 break;
966
967                 case 'attributeValueSingleQuoted':
968                     // Consume the next input character:
969                     $char = $this->stream->char();
970
971                     if($char === "'") {
972                         /* U+0027 APOSTROPHE (')
973                         Switch to the after attribute value (quoted) state. */
974                         $state = 'afterAttributeValueQuoted';
975
976                     } elseif($char === '&') {
977                         /* U+0026 AMPERSAND (&)
978                         Switch to the character reference in attribute value state, with the additional allowed character being U+0027 APOSTROPHE ('). */
979                         $this->characterReferenceInAttributeValue("'");
980
981                     } elseif($char === false) {
982                         /* EOF
983                         Parse error. Emit the current tag token. Reconsume the character
984                         in the data state. */
985                         $this->emitToken(array(
986                             'type' => self::PARSEERROR,
987                             'data' => 'eof-in-attribute-value-single-quote'
988                         ));
989                         $this->emitToken($this->token);
990
991                         $this->stream->unget();
992                         $state = 'data';
993
994                     } else {
995                         /* Anything else
996                         Append the current input character to the current attribute's value.
997                         Stay in the attribute value (single-quoted) state. */
998                         $chars = $this->stream->charsUntil("'&");
999
1000                         $last = count($this->token['attr']) - 1;
1001                         $this->token['attr'][$last]['value'] .= $char . $chars;
1002
1003                         $state = 'attributeValueSingleQuoted';
1004                     }
1005                 break;
1006
1007                 case 'attributeValueUnquoted':
1008                     // Consume the next input character:
1009                     $char = $this->stream->char();
1010
1011                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1012                         /* U+0009 CHARACTER TABULATION
1013                         U+000A LINE FEED (LF)
1014                         U+000C FORM FEED (FF)
1015                         U+0020 SPACE
1016                         Switch to the before attribute name state. */
1017                         $state = 'beforeAttributeName';
1018
1019                     } elseif($char === '&') {
1020                         /* U+0026 AMPERSAND (&)
1021                         Switch to the character reference in attribute value state. */
1022                         $this->characterReferenceInAttributeValue();
1023
1024                     } elseif($char === '>') {
1025                         /* U+003E GREATER-THAN SIGN (>)
1026                         Emit the current tag token. Switch to the data state. */
1027                         $this->emitToken($this->token);
1028                         $state = 'data';
1029
1030                     } elseif ($char === false) {
1031                         /* EOF
1032                         Parse error. Emit the current tag token. Reconsume
1033                         the character in the data state. */
1034                         $this->emitToken(array(
1035                             'type' => self::PARSEERROR,
1036                             'data' => 'eof-in-attribute-value-no-quotes'
1037                         ));
1038                         $this->emitToken($this->token);
1039                         $this->stream->unget();
1040                         $state = 'data';
1041
1042                     } else {
1043                         /* U+0022 QUOTATION MARK (")
1044                            U+0027 APOSTROPHE (')
1045                            U+003D EQUALS SIGN (=)
1046                         Parse error. Treat it as per the "anything else"
1047                         entry below. */
1048                         if($char === '"' || $char === "'" || $char === '=') {
1049                             $this->emitToken(array(
1050                                 'type' => self::PARSEERROR,
1051                                 'data' => 'unexpected-character-in-unquoted-attribute-value'
1052                             ));
1053                         }
1054
1055                         /* Anything else
1056                         Append the current input character to the current attribute's value.
1057                         Stay in the attribute value (unquoted) state. */
1058                         $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1059
1060                         $last = count($this->token['attr']) - 1;
1061                         $this->token['attr'][$last]['value'] .= $char . $chars;
1062
1063                         $state = 'attributeValueUnquoted';
1064                     }
1065                 break;
1066
1067                 case 'afterAttributeValueQuoted':
1068                     /* Consume the next input character: */
1069                     $char = $this->stream->char();
1070
1071                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1072                         /* U+0009 CHARACTER TABULATION
1073                            U+000A LINE FEED (LF)
1074                            U+000C FORM FEED (FF)
1075                            U+0020 SPACE
1076                         Switch to the before attribute name state. */
1077                         $state = 'beforeAttributeName';
1078
1079                     } elseif ($char === '/') {
1080                         /* U+002F SOLIDUS (/)
1081                         Switch to the self-closing start tag state. */
1082                         $state = 'selfClosingStartTag';
1083
1084                     } elseif ($char === '>') {
1085                         /* U+003E GREATER-THAN SIGN (>)
1086                         Emit the current tag token. Switch to the data state. */
1087                         $this->emitToken($this->token);
1088                         $state = 'data';
1089
1090                     } elseif ($char === false) {
1091                         /* EOF
1092                         Parse error. Emit the current tag token. Reconsume the EOF
1093                         character in the data state. */
1094                         $this->emitToken(array(
1095                             'type' => self::PARSEERROR,
1096                             'data' => 'unexpected-EOF-after-attribute-value'
1097                         ));
1098                         $this->emitToken($this->token);
1099                         $this->stream->unget();
1100                         $state = 'data';
1101
1102                     } else {
1103                         /* Anything else
1104                         Parse error. Reconsume the character in the before attribute
1105                         name state. */
1106                         $this->emitToken(array(
1107                             'type' => self::PARSEERROR,
1108                             'data' => 'unexpected-character-after-attribute-value'
1109                         ));
1110                         $this->stream->unget();
1111                         $state = 'beforeAttributeName';
1112                     }
1113                 break;
1114
1115                 case 'selfClosingStartTag':
1116                     /* Consume the next input character: */
1117                     $char = $this->stream->char();
1118
1119                     if ($char === '>') {
1120                         /* U+003E GREATER-THAN SIGN (>)
1121                         Set the self-closing flag of the current tag token.
1122                         Emit the current tag token. Switch to the data state. */
1123                         // not sure if this is the name we want
1124                         $this->token['self-closing'] = true;
1125                         /* When an end tag token is emitted with its self-closing flag set,
1126                         that is a parse error. */
1127                         if ($this->token['type'] === self::ENDTAG) {
1128                             $this->emitToken(array(
1129                                 'type' => self::PARSEERROR,
1130                                 'data' => 'self-closing-end-tag'
1131                             ));
1132                         }
1133                         $this->emitToken($this->token);
1134                         $state = 'data';
1135
1136                     } elseif ($char === false) {
1137                         /* EOF
1138                         Parse error. Emit the current tag token. Reconsume the
1139                         EOF character in the data state. */
1140                         $this->emitToken(array(
1141                             'type' => self::PARSEERROR,
1142                             'data' => 'unexpected-eof-after-self-closing'
1143                         ));
1144                         $this->emitToken($this->token);
1145                         $this->stream->unget();
1146                         $state = 'data';
1147
1148                     } else {
1149                         /* Anything else
1150                         Parse error. Reconsume the character in the before attribute name state. */
1151                         $this->emitToken(array(
1152                             'type' => self::PARSEERROR,
1153                             'data' => 'unexpected-character-after-self-closing'
1154                         ));
1155                         $this->stream->unget();
1156                         $state = 'beforeAttributeName';
1157                     }
1158                 break;
1159
1160                 case 'bogusComment':
1161                     /* (This can only happen if the content model flag is set to the PCDATA state.) */
1162                     /* Consume every character up to the first U+003E GREATER-THAN SIGN
1163                     character (>) or the end of the file (EOF), whichever comes first. Emit
1164                     a comment token whose data is the concatenation of all the characters
1165                     starting from and including the character that caused the state machine
1166                     to switch into the bogus comment state, up to and including the last
1167                     consumed character before the U+003E character, if any, or up to the
1168                     end of the file otherwise. (If the comment was started by the end of
1169                     the file (EOF), the token is empty.) */
1170                     $this->token['data'] .= (string) $this->stream->charsUntil('>');
1171                     $this->stream->char();
1172
1173                     $this->emitToken($this->token);
1174
1175                     /* Switch to the data state. */
1176                     $state = 'data';
1177                 break;
1178
1179                 case 'markupDeclarationOpen':
1180                     // Consume for below
1181                     $hyphens = $this->stream->charsWhile('-', 2);
1182                     if ($hyphens === '-') {
1183                         $this->stream->unget();
1184                     }
1185                     if ($hyphens !== '--') {
1186                         $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1187                     }
1188
1189                     /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1190                     characters, consume those two characters, create a comment token whose
1191                     data is the empty string, and switch to the comment state. */
1192                     if($hyphens === '--') {
1193                         $state = 'commentStart';
1194                         $this->token = array(
1195                             'data' => '',
1196                             'type' => self::COMMENT
1197                         );
1198
1199                     /* Otherwise if the next seven characters are a case-insensitive match
1200                     for the word "DOCTYPE", then consume those characters and switch to the
1201                     DOCTYPE state. */
1202                     } elseif(strtoupper($alpha) === 'DOCTYPE') {
1203                         $state = 'doctype';
1204
1205                     // XXX not implemented
1206                     /* Otherwise, if the insertion mode is "in foreign content"
1207                     and the current node is not an element in the HTML namespace
1208                     and the next seven characters are an ASCII case-sensitive
1209                     match for the string "[CDATA[" (the five uppercase letters
1210                     "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1211                     and after), then consume those characters and switch to the
1212                     CDATA section state (which is unrelated to the content model
1213                     flag's CDATA state). */
1214
1215                     /* Otherwise, this is a parse error. Switch to the bogus comment state.
1216                     The next character that is consumed, if any, is the first character
1217                     that will be in the comment. */
1218                     } else {
1219                         $this->emitToken(array(
1220                             'type' => self::PARSEERROR,
1221                             'data' => 'expected-dashes-or-doctype'
1222                         ));
1223                         $this->token = array(
1224                             'data' => (string) $alpha,
1225                             'type' => self::COMMENT
1226                         );
1227                         $state = 'bogusComment';
1228                     }
1229                 break;
1230
1231                 case 'commentStart':
1232                     /* Consume the next input character: */
1233                     $char = $this->stream->char();
1234
1235                     if ($char === '-') {
1236                         /* U+002D HYPHEN-MINUS (-)
1237                         Switch to the comment start dash state. */
1238                         $state = 'commentStartDash';
1239                     } elseif ($char === '>') {
1240                         /* U+003E GREATER-THAN SIGN (>)
1241                         Parse error. Emit the comment token. Switch to the
1242                         data state. */
1243                         $this->emitToken(array(
1244                             'type' => self::PARSEERROR,
1245                             'data' => 'incorrect-comment'
1246                         ));
1247                         $this->emitToken($this->token);
1248                         $state = 'data';
1249                     } elseif ($char === false) {
1250                         /* EOF
1251                         Parse error. Emit the comment token. Reconsume the
1252                         EOF character in the data state. */
1253                         $this->emitToken(array(
1254                             'type' => self::PARSEERROR,
1255                             'data' => 'eof-in-comment'
1256                         ));
1257                         $this->emitToken($this->token);
1258                         $this->stream->unget();
1259                         $state = 'data';
1260                     } else {
1261                         /* Anything else
1262                         Append the input character to the comment token's
1263                         data. Switch to the comment state. */
1264                         $this->token['data'] .= $char;
1265                         $state = 'comment';
1266                     }
1267                 break;
1268
1269                 case 'commentStartDash':
1270                     /* Consume the next input character: */
1271                     $char = $this->stream->char();
1272                     if ($char === '-') {
1273                         /* U+002D HYPHEN-MINUS (-)
1274                         Switch to the comment end state */
1275                         $state = 'commentEnd';
1276                     } elseif ($char === '>') {
1277                         /* U+003E GREATER-THAN SIGN (>)
1278                         Parse error. Emit the comment token. Switch to the
1279                         data state. */
1280                         $this->emitToken(array(
1281                             'type' => self::PARSEERROR,
1282                             'data' => 'incorrect-comment'
1283                         ));
1284                         $this->emitToken($this->token);
1285                         $state = 'data';
1286                     } elseif ($char === false) {
1287                         /* Parse error. Emit the comment token. Reconsume the
1288                         EOF character in the data state. */
1289                         $this->emitToken(array(
1290                             'type' => self::PARSEERROR,
1291                             'data' => 'eof-in-comment'
1292                         ));
1293                         $this->emitToken($this->token);
1294                         $this->stream->unget();
1295                         $state = 'data';
1296                     } else {
1297                         $this->token['data'] .= '-' . $char;
1298                         $state = 'comment';
1299                     }
1300                 break;
1301
1302                 case 'comment':
1303                     /* Consume the next input character: */
1304                     $char = $this->stream->char();
1305
1306                     if($char === '-') {
1307                         /* U+002D HYPHEN-MINUS (-)
1308                         Switch to the comment end dash state */
1309                         $state = 'commentEndDash';
1310
1311                     } elseif($char === false) {
1312                         /* EOF
1313                         Parse error. Emit the comment token. Reconsume the EOF character
1314                         in the data state. */
1315                         $this->emitToken(array(
1316                             'type' => self::PARSEERROR,
1317                             'data' => 'eof-in-comment'
1318                         ));
1319                         $this->emitToken($this->token);
1320                         $this->stream->unget();
1321                         $state = 'data';
1322
1323                     } else {
1324                         /* Anything else
1325                         Append the input character to the comment token's data. Stay in
1326                         the comment state. */
1327                         $chars = $this->stream->charsUntil('-');
1328
1329                         $this->token['data'] .= $char . $chars;
1330                     }
1331                 break;
1332
1333                 case 'commentEndDash':
1334                     /* Consume the next input character: */
1335                     $char = $this->stream->char();
1336
1337                     if($char === '-') {
1338                         /* U+002D HYPHEN-MINUS (-)
1339                         Switch to the comment end state  */
1340                         $state = 'commentEnd';
1341
1342                     } elseif($char === false) {
1343                         /* EOF
1344                         Parse error. Emit the comment token. Reconsume the EOF character
1345                         in the data state. */
1346                         $this->emitToken(array(
1347                             'type' => self::PARSEERROR,
1348                             'data' => 'eof-in-comment-end-dash'
1349                         ));
1350                         $this->emitToken($this->token);
1351                         $this->stream->unget();
1352                         $state = 'data';
1353
1354                     } else {
1355                         /* Anything else
1356                         Append a U+002D HYPHEN-MINUS (-) character and the input
1357                         character to the comment token's data. Switch to the comment state. */
1358                         $this->token['data'] .= '-'.$char;
1359                         $state = 'comment';
1360                     }
1361                 break;
1362
1363                 case 'commentEnd':
1364                     /* Consume the next input character: */
1365                     $char = $this->stream->char();
1366
1367                     if($char === '>') {
1368                         /* U+003E GREATER-THAN SIGN (>)
1369                         Emit the comment token. Switch to the data state. */
1370                         $this->emitToken($this->token);
1371                         $state = 'data';
1372
1373                     } elseif($char === '-') {
1374                         /* U+002D HYPHEN-MINUS (-)
1375                         Parse error. Append a U+002D HYPHEN-MINUS (-) character
1376                         to the comment token's data. Stay in the comment end
1377                         state. */
1378                         $this->emitToken(array(
1379                             'type' => self::PARSEERROR,
1380                             'data' => 'unexpected-dash-after-double-dash-in-comment'
1381                         ));
1382                         $this->token['data'] .= '-';
1383
1384                     } elseif($char === false) {
1385                         /* EOF
1386                         Parse error. Emit the comment token. Reconsume the
1387                         EOF character in the data state. */
1388                         $this->emitToken(array(
1389                             'type' => self::PARSEERROR,
1390                             'data' => 'eof-in-comment-double-dash'
1391                         ));
1392                         $this->emitToken($this->token);
1393                         $this->stream->unget();
1394                         $state = 'data';
1395
1396                     } else {
1397                         /* Anything else
1398                         Parse error. Append two U+002D HYPHEN-MINUS (-)
1399                         characters and the input character to the comment token's
1400                         data. Switch to the comment state. */
1401                         $this->emitToken(array(
1402                             'type' => self::PARSEERROR,
1403                             'data' => 'unexpected-char-in-comment'
1404                         ));
1405                         $this->token['data'] .= '--'.$char;
1406                         $state = 'comment';
1407                     }
1408                 break;
1409
1410                 case 'doctype':
1411                     /* Consume the next input character: */
1412                     $char = $this->stream->char();
1413
1414                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1415                         /* U+0009 CHARACTER TABULATION
1416                            U+000A LINE FEED (LF)
1417                            U+000C FORM FEED (FF)
1418                            U+0020 SPACE
1419                         Switch to the before DOCTYPE name state. */
1420                         $state = 'beforeDoctypeName';
1421
1422                     } else {
1423                         /* Anything else
1424                         Parse error. Reconsume the current character in the
1425                         before DOCTYPE name state. */
1426                         $this->emitToken(array(
1427                             'type' => self::PARSEERROR,
1428                             'data' => 'need-space-after-doctype'
1429                         ));
1430                         $this->stream->unget();
1431                         $state = 'beforeDoctypeName';
1432                     }
1433                 break;
1434
1435                 case 'beforeDoctypeName':
1436                     /* Consume the next input character: */
1437                     $char = $this->stream->char();
1438
1439                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1440                         /* U+0009 CHARACTER TABULATION
1441                            U+000A LINE FEED (LF)
1442                            U+000C FORM FEED (FF)
1443                            U+0020 SPACE
1444                         Stay in the before DOCTYPE name state. */
1445
1446                     } elseif($char === '>') {
1447                         /* U+003E GREATER-THAN SIGN (>)
1448                         Parse error. Create a new DOCTYPE token. Set its
1449                         force-quirks flag to on. Emit the token. Switch to the
1450                         data state. */
1451                         $this->emitToken(array(
1452                             'type' => self::PARSEERROR,
1453                             'data' => 'expected-doctype-name-but-got-right-bracket'
1454                         ));
1455                         $this->emitToken(array(
1456                             'name' => '',
1457                             'type' => self::DOCTYPE,
1458                             'force-quirks' => true,
1459                             'error' => true
1460                         ));
1461
1462                         $state = 'data';
1463
1464                     } elseif('A' <= $char && $char <= 'Z') {
1465                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1466                         Create a new DOCTYPE token. Set the token's name to the
1467                         lowercase version of the input character (add 0x0020 to
1468                         the character's code point). Switch to the DOCTYPE name
1469                         state. */
1470                         $this->token = array(
1471                             'name' => strtolower($char),
1472                             'type' => self::DOCTYPE,
1473                             'error' => true
1474                         );
1475
1476                         $state = 'doctypeName';
1477
1478                     } elseif($char === false) {
1479                         /* EOF
1480                         Parse error. Create a new DOCTYPE token. Set its
1481                         force-quirks flag to on. Emit the token. Reconsume the
1482                         EOF character in the data state. */
1483                         $this->emitToken(array(
1484                             'type' => self::PARSEERROR,
1485                             'data' => 'expected-doctype-name-but-got-eof'
1486                         ));
1487                         $this->emitToken(array(
1488                             'name' => '',
1489                             'type' => self::DOCTYPE,
1490                             'force-quirks' => true,
1491                             'error' => true
1492                         ));
1493
1494                         $this->stream->unget();
1495                         $state = 'data';
1496
1497                     } else {
1498                         /* Anything else
1499                         Create a new DOCTYPE token. Set the token's name to the
1500                         current input character. Switch to the DOCTYPE name state. */
1501                         $this->token = array(
1502                             'name' => $char,
1503                             'type' => self::DOCTYPE,
1504                             'error' => true
1505                         );
1506
1507                         $state = 'doctypeName';
1508                     }
1509                 break;
1510
1511                 case 'doctypeName':
1512                     /* Consume the next input character: */
1513                     $char = $this->stream->char();
1514
1515                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1516                         /* U+0009 CHARACTER TABULATION
1517                            U+000A LINE FEED (LF)
1518                            U+000C FORM FEED (FF)
1519                            U+0020 SPACE
1520                         Switch to the after DOCTYPE name state. */
1521                         $state = 'afterDoctypeName';
1522
1523                     } elseif($char === '>') {
1524                         /* U+003E GREATER-THAN SIGN (>)
1525                         Emit the current DOCTYPE token. Switch to the data state. */
1526                         $this->emitToken($this->token);
1527                         $state = 'data';
1528
1529                     } elseif('A' <= $char && $char <= 'Z') {
1530                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1531                         Append the lowercase version of the input character
1532                         (add 0x0020 to the character's code point) to the current
1533                         DOCTYPE token's name. Stay in the DOCTYPE name state. */
1534                         $this->token['name'] .= strtolower($char);
1535
1536                     } elseif($char === false) {
1537                         /* EOF
1538                         Parse error. Set the DOCTYPE token's force-quirks flag
1539                         to on. Emit that DOCTYPE token. Reconsume the EOF
1540                         character in the data state. */
1541                         $this->emitToken(array(
1542                             'type' => self::PARSEERROR,
1543                             'data' => 'eof-in-doctype-name'
1544                         ));
1545                         $this->token['force-quirks'] = true;
1546                         $this->emitToken($this->token);
1547                         $this->stream->unget();
1548                         $state = 'data';
1549
1550                     } else {
1551                         /* Anything else
1552                         Append the current input character to the current
1553                         DOCTYPE token's name. Stay in the DOCTYPE name state. */
1554                         $this->token['name'] .= $char;
1555                     }
1556
1557                     // XXX this is probably some sort of quirks mode designation,
1558                     // check tree-builder to be sure. In general 'error' needs
1559                     // to be specc'ified, this probably means removing it at the end
1560                     $this->token['error'] = ($this->token['name'] === 'HTML')
1561                         ? false
1562                         : true;
1563                 break;
1564
1565                 case 'afterDoctypeName':
1566                     /* Consume the next input character: */
1567                     $char = $this->stream->char();
1568
1569                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1570                         /* U+0009 CHARACTER TABULATION
1571                            U+000A LINE FEED (LF)
1572                            U+000C FORM FEED (FF)
1573                            U+0020 SPACE
1574                         Stay in the after DOCTYPE name state. */
1575
1576                     } elseif($char === '>') {
1577                         /* U+003E GREATER-THAN SIGN (>)
1578                         Emit the current DOCTYPE token. Switch to the data state. */
1579                         $this->emitToken($this->token);
1580                         $state = 'data';
1581
1582                     } elseif($char === false) {
1583                         /* EOF
1584                         Parse error. Set the DOCTYPE token's force-quirks flag
1585                         to on. Emit that DOCTYPE token. Reconsume the EOF
1586                         character in the data state. */
1587                         $this->emitToken(array(
1588                             'type' => self::PARSEERROR,
1589                             'data' => 'eof-in-doctype'
1590                         ));
1591                         $this->token['force-quirks'] = true;
1592                         $this->emitToken($this->token);
1593                         $this->stream->unget();
1594                         $state = 'data';
1595
1596                     } else {
1597                         /* Anything else */
1598
1599                         $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1600                         if ($nextSix === 'PUBLIC') {
1601                             /* If the next six characters are an ASCII
1602                             case-insensitive match for the word "PUBLIC", then
1603                             consume those characters and switch to the before
1604                             DOCTYPE public identifier state. */
1605                             $state = 'beforeDoctypePublicIdentifier';
1606
1607                         } elseif ($nextSix === 'SYSTEM') {
1608                             /* Otherwise, if the next six characters are an ASCII
1609                             case-insensitive match for the word "SYSTEM", then
1610                             consume those characters and switch to the before
1611                             DOCTYPE system identifier state. */
1612                             $state = 'beforeDoctypeSystemIdentifier';
1613
1614                         } else {
1615                             /* Otherwise, this is a parse error. Set the DOCTYPE
1616                             token's force-quirks flag to on. Switch to the bogus
1617                             DOCTYPE state. */
1618                             $this->emitToken(array(
1619                                 'type' => self::PARSEERROR,
1620                                 'data' => 'expected-space-or-right-bracket-in-doctype'
1621                             ));
1622                             $this->token['force-quirks'] = true;
1623                             $this->token['error'] = true;
1624                             $state = 'bogusDoctype';
1625                         }
1626                     }
1627                 break;
1628
1629                 case 'beforeDoctypePublicIdentifier':
1630                     /* Consume the next input character: */
1631                     $char = $this->stream->char();
1632
1633                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1634                         /* U+0009 CHARACTER TABULATION
1635                            U+000A LINE FEED (LF)
1636                            U+000C FORM FEED (FF)
1637                            U+0020 SPACE
1638                         Stay in the before DOCTYPE public identifier state. */
1639                     } elseif ($char === '"') {
1640                         /* U+0022 QUOTATION MARK (")
1641                         Set the DOCTYPE token's public identifier to the empty
1642                         string (not missing), then switch to the DOCTYPE public
1643                         identifier (double-quoted) state. */
1644                         $this->token['public'] = '';
1645                         $state = 'doctypePublicIdentifierDoubleQuoted';
1646                     } elseif ($char === "'") {
1647                         /* U+0027 APOSTROPHE (')
1648                         Set the DOCTYPE token's public identifier to the empty
1649                         string (not missing), then switch to the DOCTYPE public
1650                         identifier (single-quoted) state. */
1651                         $this->token['public'] = '';
1652                         $state = 'doctypePublicIdentifierSingleQuoted';
1653                     } elseif ($char === '>') {
1654                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1655                         to on. Emit that DOCTYPE token. Switch to the data state. */
1656                         $this->emitToken(array(
1657                             'type' => self::PARSEERROR,
1658                             'data' => 'unexpected-end-of-doctype'
1659                         ));
1660                         $this->token['force-quirks'] = true;
1661                         $this->emitToken($this->token);
1662                         $state = 'data';
1663                     } elseif ($char === false) {
1664                         /* Parse error. Set the DOCTYPE token's force-quirks
1665                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1666                         character in the data state. */
1667                         $this->emitToken(array(
1668                             'type' => self::PARSEERROR,
1669                             'data' => 'eof-in-doctype'
1670                         ));
1671                         $this->token['force-quirks'] = true;
1672                         $this->emitToken($this->token);
1673                         $this->stream->unget();
1674                         $state = 'data';
1675                     } else {
1676                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1677                         to on. Switch to the bogus DOCTYPE state. */
1678                         $this->emitToken(array(
1679                             'type' => self::PARSEERROR,
1680                             'data' => 'unexpected-char-in-doctype'
1681                         ));
1682                         $this->token['force-quirks'] = true;
1683                         $state = 'bogusDoctype';
1684                     }
1685                 break;
1686
1687                 case 'doctypePublicIdentifierDoubleQuoted':
1688                     /* Consume the next input character: */
1689                     $char = $this->stream->char();
1690
1691                     if ($char === '"') {
1692                         /* U+0022 QUOTATION MARK (")
1693                         Switch to the after DOCTYPE public identifier state. */
1694                         $state = 'afterDoctypePublicIdentifier';
1695                     } elseif ($char === '>') {
1696                         /* U+003E GREATER-THAN SIGN (>)
1697                         Parse error. Set the DOCTYPE token's force-quirks flag
1698                         to on. Emit that DOCTYPE token. Switch to the data state. */
1699                         $this->emitToken(array(
1700                             'type' => self::PARSEERROR,
1701                             'data' => 'unexpected-end-of-doctype'
1702                         ));
1703                         $this->token['force-quirks'] = true;
1704                         $this->emitToken($this->token);
1705                         $state = 'data';
1706                     } elseif ($char === false) {
1707                         /* EOF
1708                         Parse error. Set the DOCTYPE token's force-quirks flag
1709                         to on. Emit that DOCTYPE token. Reconsume the EOF
1710                         character in the data state. */
1711                         $this->emitToken(array(
1712                             'type' => self::PARSEERROR,
1713                             'data' => 'eof-in-doctype'
1714                         ));
1715                         $this->token['force-quirks'] = true;
1716                         $this->emitToken($this->token);
1717                         $this->stream->unget();
1718                         $state = 'data';
1719                     } else {
1720                         /* Anything else
1721                         Append the current input character to the current
1722                         DOCTYPE token's public identifier. Stay in the DOCTYPE
1723                         public identifier (double-quoted) state. */
1724                         $this->token['public'] .= $char;
1725                     }
1726                 break;
1727
1728                 case 'doctypePublicIdentifierSingleQuoted':
1729                     /* Consume the next input character: */
1730                     $char = $this->stream->char();
1731
1732                     if ($char === "'") {
1733                         /* U+0027 APOSTROPHE (')
1734                         Switch to the after DOCTYPE public identifier state. */
1735                         $state = 'afterDoctypePublicIdentifier';
1736                     } elseif ($char === '>') {
1737                         /* U+003E GREATER-THAN SIGN (>)
1738                         Parse error. Set the DOCTYPE token's force-quirks flag
1739                         to on. Emit that DOCTYPE token. Switch to the data state. */
1740                         $this->emitToken(array(
1741                             'type' => self::PARSEERROR,
1742                             'data' => 'unexpected-end-of-doctype'
1743                         ));
1744                         $this->token['force-quirks'] = true;
1745                         $this->emitToken($this->token);
1746                         $state = 'data';
1747                     } elseif ($char === false) {
1748                         /* EOF
1749                         Parse error. Set the DOCTYPE token's force-quirks flag
1750                         to on. Emit that DOCTYPE token. Reconsume the EOF
1751                         character in the data state. */
1752                         $this->emitToken(array(
1753                             'type' => self::PARSEERROR,
1754                             'data' => 'eof-in-doctype'
1755                         ));
1756                         $this->token['force-quirks'] = true;
1757                         $this->emitToken($this->token);
1758                         $this->stream->unget();
1759                         $state = 'data';
1760                     } else {
1761                         /* Anything else
1762                         Append the current input character to the current
1763                         DOCTYPE token's public identifier. Stay in the DOCTYPE
1764                         public identifier (single-quoted) state. */
1765                         $this->token['public'] .= $char;
1766                     }
1767                 break;
1768
1769                 case 'afterDoctypePublicIdentifier':
1770                     /* Consume the next input character: */
1771                     $char = $this->stream->char();
1772
1773                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1774                         /* U+0009 CHARACTER TABULATION
1775                            U+000A LINE FEED (LF)
1776                            U+000C FORM FEED (FF)
1777                            U+0020 SPACE
1778                         Stay in the after DOCTYPE public identifier state. */
1779                     } elseif ($char === '"') {
1780                         /* U+0022 QUOTATION MARK (")
1781                         Set the DOCTYPE token's system identifier to the
1782                         empty string (not missing), then switch to the DOCTYPE
1783                         system identifier (double-quoted) state. */
1784                         $this->token['system'] = '';
1785                         $state = 'doctypeSystemIdentifierDoubleQuoted';
1786                     } elseif ($char === "'") {
1787                         /* U+0027 APOSTROPHE (')
1788                         Set the DOCTYPE token's system identifier to the
1789                         empty string (not missing), then switch to the DOCTYPE
1790                         system identifier (single-quoted) state. */
1791                         $this->token['system'] = '';
1792                         $state = 'doctypeSystemIdentifierSingleQuoted';
1793                     } elseif ($char === '>') {
1794                         /* U+003E GREATER-THAN SIGN (>)
1795                         Emit the current DOCTYPE token. Switch to the data state. */
1796                         $this->emitToken($this->token);
1797                         $state = 'data';
1798                     } elseif ($char === false) {
1799                         /* Parse error. Set the DOCTYPE token's force-quirks
1800                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1801                         character in the data state. */
1802                         $this->emitToken(array(
1803                             'type' => self::PARSEERROR,
1804                             'data' => 'eof-in-doctype'
1805                         ));
1806                         $this->token['force-quirks'] = true;
1807                         $this->emitToken($this->token);
1808                         $this->stream->unget();
1809                         $state = 'data';
1810                     } else {
1811                         /* Anything else
1812                         Parse error. Set the DOCTYPE token's force-quirks flag
1813                         to on. Switch to the bogus DOCTYPE state. */
1814                         $this->emitToken(array(
1815                             'type' => self::PARSEERROR,
1816                             'data' => 'unexpected-char-in-doctype'
1817                         ));
1818                         $this->token['force-quirks'] = true;
1819                         $state = 'bogusDoctype';
1820                     }
1821                 break;
1822
1823                 case 'beforeDoctypeSystemIdentifier':
1824                     /* Consume the next input character: */
1825                     $char = $this->stream->char();
1826
1827                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1828                         /* U+0009 CHARACTER TABULATION
1829                            U+000A LINE FEED (LF)
1830                            U+000C FORM FEED (FF)
1831                            U+0020 SPACE
1832                         Stay in the before DOCTYPE system identifier state. */
1833                     } elseif ($char === '"') {
1834                         /* U+0022 QUOTATION MARK (")
1835                         Set the DOCTYPE token's system identifier to the empty
1836                         string (not missing), then switch to the DOCTYPE system
1837                         identifier (double-quoted) state. */
1838                         $this->token['system'] = '';
1839                         $state = 'doctypeSystemIdentifierDoubleQuoted';
1840                     } elseif ($char === "'") {
1841                         /* U+0027 APOSTROPHE (')
1842                         Set the DOCTYPE token's system identifier to the empty
1843                         string (not missing), then switch to the DOCTYPE system
1844                         identifier (single-quoted) state. */
1845                         $this->token['system'] = '';
1846                         $state = 'doctypeSystemIdentifierSingleQuoted';
1847                     } elseif ($char === '>') {
1848                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1849                         to on. Emit that DOCTYPE token. Switch to the data state. */
1850                         $this->emitToken(array(
1851                             'type' => self::PARSEERROR,
1852                             'data' => 'unexpected-char-in-doctype'
1853                         ));
1854                         $this->token['force-quirks'] = true;
1855                         $this->emitToken($this->token);
1856                         $state = 'data';
1857                     } elseif ($char === false) {
1858                         /* Parse error. Set the DOCTYPE token's force-quirks
1859                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1860                         character in the data state. */
1861                         $this->emitToken(array(
1862                             'type' => self::PARSEERROR,
1863                             'data' => 'eof-in-doctype'
1864                         ));
1865                         $this->token['force-quirks'] = true;
1866                         $this->emitToken($this->token);
1867                         $this->stream->unget();
1868                         $state = 'data';
1869                     } else {
1870                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1871                         to on. Switch to the bogus DOCTYPE state. */
1872                         $this->emitToken(array(
1873                             'type' => self::PARSEERROR,
1874                             'data' => 'unexpected-char-in-doctype'
1875                         ));
1876                         $this->token['force-quirks'] = true;
1877                         $state = 'bogusDoctype';
1878                     }
1879                 break;
1880
1881                 case 'doctypeSystemIdentifierDoubleQuoted':
1882                     /* Consume the next input character: */
1883                     $char = $this->stream->char();
1884
1885                     if ($char === '"') {
1886                         /* U+0022 QUOTATION MARK (")
1887                         Switch to the after DOCTYPE system identifier state. */
1888                         $state = 'afterDoctypeSystemIdentifier';
1889                     } elseif ($char === '>') {
1890                         /* U+003E GREATER-THAN SIGN (>)
1891                         Parse error. Set the DOCTYPE token's force-quirks flag
1892                         to on. Emit that DOCTYPE token. Switch to the data state. */
1893                         $this->emitToken(array(
1894                             'type' => self::PARSEERROR,
1895                             'data' => 'unexpected-end-of-doctype'
1896                         ));
1897                         $this->token['force-quirks'] = true;
1898                         $this->emitToken($this->token);
1899                         $state = 'data';
1900                     } elseif ($char === false) {
1901                         /* EOF
1902                         Parse error. Set the DOCTYPE token's force-quirks flag
1903                         to on. Emit that DOCTYPE token. Reconsume the EOF
1904                         character in the data state. */
1905                         $this->emitToken(array(
1906                             'type' => self::PARSEERROR,
1907                             'data' => 'eof-in-doctype'
1908                         ));
1909                         $this->token['force-quirks'] = true;
1910                         $this->emitToken($this->token);
1911                         $this->stream->unget();
1912                         $state = 'data';
1913                     } else {
1914                         /* Anything else
1915                         Append the current input character to the current
1916                         DOCTYPE token's system identifier. Stay in the DOCTYPE
1917                         system identifier (double-quoted) state. */
1918                         $this->token['system'] .= $char;
1919                     }
1920                 break;
1921
1922                 case 'doctypeSystemIdentifierSingleQuoted':
1923                     /* Consume the next input character: */
1924                     $char = $this->stream->char();
1925
1926                     if ($char === "'") {
1927                         /* U+0027 APOSTROPHE (')
1928                         Switch to the after DOCTYPE system identifier state. */
1929                         $state = 'afterDoctypeSystemIdentifier';
1930                     } elseif ($char === '>') {
1931                         /* U+003E GREATER-THAN SIGN (>)
1932                         Parse error. Set the DOCTYPE token's force-quirks flag
1933                         to on. Emit that DOCTYPE token. Switch to the data state. */
1934                         $this->emitToken(array(
1935                             'type' => self::PARSEERROR,
1936                             'data' => 'unexpected-end-of-doctype'
1937                         ));
1938                         $this->token['force-quirks'] = true;
1939                         $this->emitToken($this->token);
1940                         $state = 'data';
1941                     } elseif ($char === false) {
1942                         /* EOF
1943                         Parse error. Set the DOCTYPE token's force-quirks flag
1944                         to on. Emit that DOCTYPE token. Reconsume the EOF
1945                         character in the data state. */
1946                         $this->emitToken(array(
1947                             'type' => self::PARSEERROR,
1948                             'data' => 'eof-in-doctype'
1949                         ));
1950                         $this->token['force-quirks'] = true;
1951                         $this->emitToken($this->token);
1952                         $this->stream->unget();
1953                         $state = 'data';
1954                     } else {
1955                         /* Anything else
1956                         Append the current input character to the current
1957                         DOCTYPE token's system identifier. Stay in the DOCTYPE
1958                         system identifier (double-quoted) state. */
1959                         $this->token['system'] .= $char;
1960                     }
1961                 break;
1962
1963                 case 'afterDoctypeSystemIdentifier':
1964                     /* Consume the next input character: */
1965                     $char = $this->stream->char();
1966
1967                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1968                         /* U+0009 CHARACTER TABULATION
1969                            U+000A LINE FEED (LF)
1970                            U+000C FORM FEED (FF)
1971                            U+0020 SPACE
1972                         Stay in the after DOCTYPE system identifier state. */
1973                     } elseif ($char === '>') {
1974                         /* U+003E GREATER-THAN SIGN (>)
1975                         Emit the current DOCTYPE token. Switch to the data state. */
1976                         $this->emitToken($this->token);
1977                         $state = 'data';
1978                     } elseif ($char === false) {
1979                         /* Parse error. Set the DOCTYPE token's force-quirks
1980                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1981                         character in the data state. */
1982                         $this->emitToken(array(
1983                             'type' => self::PARSEERROR,
1984                             'data' => 'eof-in-doctype'
1985                         ));
1986                         $this->token['force-quirks'] = true;
1987                         $this->emitToken($this->token);
1988                         $this->stream->unget();
1989                         $state = 'data';
1990                     } else {
1991                         /* Anything else
1992                         Parse error. Switch to the bogus DOCTYPE state.
1993                         (This does not set the DOCTYPE token's force-quirks
1994                         flag to on.) */
1995                         $this->emitToken(array(
1996                             'type' => self::PARSEERROR,
1997                             'data' => 'unexpected-char-in-doctype'
1998                         ));
1999                         $state = 'bogusDoctype';
2000                     }
2001                 break;
2002
2003                 case 'bogusDoctype':
2004                     /* Consume the next input character: */
2005                     $char = $this->stream->char();
2006
2007                     if ($char === '>') {
2008                         /* U+003E GREATER-THAN SIGN (>)
2009                         Emit the DOCTYPE token. Switch to the data state. */
2010                         $this->emitToken($this->token);
2011                         $state = 'data';
2012
2013                     } elseif($char === false) {
2014                         /* EOF
2015                         Emit the DOCTYPE token. Reconsume the EOF character in
2016                         the data state. */
2017                         $this->emitToken($this->token);
2018                         $this->stream->unget();
2019                         $state = 'data';
2020
2021                     } else {
2022                         /* Anything else
2023                         Stay in the bogus DOCTYPE state. */
2024                     }
2025                 break;
2026
2027                 // case 'cdataSection':
2028
2029             }
2030         }
2031     }
2032
/**
 * Serializes the parsed tree.
 *
 * Delegates to the tree builder's own save() and hands its result
 * back to the caller unchanged.
 */
public function save() {
    $serialized = $this->tree->save();
    return $serialized;
}
2039
/**
 * Accessor for the input stream this tokenizer reads from.
 *
 * The stream object itself is returned, not a copy.
 */
public function stream() {
    $input = $this->stream;
    return $input;
}
2046
/**
 * Attempts to consume a character reference from the input stream.
 *
 * Per the specification text quoted below, the first character read here
 * is the one immediately after the U+0026 AMPERSAND. Every path returns
 * the text that should stand in for the reference: the decoded
 * character(s) on success, or a literal '&' followed by whatever was
 * consumed when no reference could be matched.
 *
 * @param string|false $allowed The additional allowed character, or
 *                              false when there is none.
 * @param bool         $inattr  True when the reference is being consumed
 *                              as part of an attribute value.
 * @return string Replacement text for the character reference.
 */
private function consumeCharacterReference($allowed = false, $inattr = false) {
    // This goes quite far against spec, and is far closer to the Python
    // impl., mainly because we don't do the large unconsuming the spec
    // requires.

    // All consumed characters.
    $chars = $this->stream->char();

    /* This section defines how to consume a character
    reference. This definition is used when parsing character
    references in text and in attributes.

    The behavior depends on the identity of the next character
    (the one immediately after the U+0026 AMPERSAND character): */

    // The EOF test has to come first: char() yields false at EOF, and
    // indexing false with [0] raises a notice/warning on PHP >= 7.4.
    if (
        $chars === false ||
        $chars[0] === "\x09" ||
        $chars[0] === "\x0A" ||
        $chars[0] === "\x0C" ||
        $chars[0] === "\x20" ||
        $chars[0] === '<' ||
        $chars[0] === '&' ||
        $chars[0] === $allowed
    ) {
        /* U+0009 CHARACTER TABULATION
           U+000A LINE FEED (LF)
           U+000C FORM FEED (FF)
           U+0020 SPACE
           U+003C LESS-THAN SIGN
           U+0026 AMPERSAND
           EOF
           The additional allowed character, if there is one
        Not a character reference. No characters are consumed,
        and nothing is returned. (This is not an error, either.) */
        // We already consumed, so unconsume.
        $this->stream->unget();
        return '&';
    } elseif ($chars[0] === '#') {
        /* Consume the U+0023 NUMBER SIGN. */
        // Um, yeah, we already did that.
        /* The behavior further depends on the character after
        the U+0023 NUMBER SIGN: */
        $chars .= $this->stream->char();
        if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
            /* U+0078 LATIN SMALL LETTER X
               U+0058 LATIN CAPITAL LETTER X */
            /* Consume the X. */
            // Um, yeah, we already did that.
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE, U+0061 LATIN SMALL LETTER A through to U+0066
            LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
            A, through to U+0046 LATIN CAPITAL LETTER F (in other
            words, 0123456789, ABCDEF, abcdef). */
            $char_class = self::HEX;
            /* When it comes to interpreting the
            number, interpret it as a hexadecimal number. */
            $hex = true;
        } else {
            /* Anything else */
            // Unconsume because we shouldn't have consumed this.
            $chars = $chars[0];
            $this->stream->unget();
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE (i.e. just 0123456789). */
            $char_class = self::DIGIT;
            /* When it comes to interpreting the number,
            interpret it as a decimal number. */
            $hex = false;
        }

        /* Consume as many characters as match the range of characters given above. */
        $consumed = $this->stream->charsWhile($char_class);
        if ($consumed === '' || $consumed === false) {
            /* If no characters match the range, then don't consume
            any characters (and unconsume the U+0023 NUMBER SIGN
            character and, if appropriate, the X character). This
            is a parse error; nothing is returned. */
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-numeric-entity'
            ));
            // Rather than unconsuming, hand the consumed "#" / "#x" back
            // as literal text after the ampersand.
            return '&' . $chars;
        } else {
            /* Otherwise, if the next character is a U+003B SEMICOLON,
            consume that too. If it isn't, there is a parse error. */
            if ($this->stream->char() !== ';') {
                $this->stream->unget();
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'numeric-entity-without-semicolon'
                ));
            }

            /* If one or more characters match the range, then take
            them all and interpret the string of characters as a number
            (either hexadecimal or decimal as appropriate). */
            $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

            /* If that number is one of the numbers in the first column
            of the following table, then this is a parse error. Find the
            row with that number in the first column, and return a
            character token for the Unicode character given in the
            second column of that row. */
            $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
            if ($new_codepoint) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'illegal-windows-1252-entity'
                ));
                $codepoint = $new_codepoint;
            } else {
                /* Otherwise, if the number is in the range 0x0000 to 0x0008,
                U+000B,  U+000E to 0x001F,  0x007F  to 0x009F, 0xD800 to 0xDFFF ,
                0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
                0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF,
                0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE,
                0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                0x10FFFE, or 0x10FFFF, or is higher than 0x10FFFF, then this
                is a parse error; return a character token for the U+FFFD
                REPLACEMENT CHARACTER character instead. */
                // && has higher precedence than ||
                // The 0xFFFE mask catches every code point whose low 16
                // bits are FFFE or FFFF, i.e. the whole xFFFE/xFFFF list.
                if (
                    $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
                    $codepoint === 0x000B ||
                    $codepoint >= 0x000E && $codepoint <= 0x001F ||
                    $codepoint >= 0x007F && $codepoint <= 0x009F ||
                    $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
                    $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
                    ($codepoint & 0xFFFE) === 0xFFFE ||
                    $codepoint > 0x10FFFF
                ) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'illegal-codepoint-for-numeric-entity'
                    ));
                    $codepoint = 0xFFFD;
                }
            }

            /* Otherwise, return a character token for the Unicode
            character whose code point is that number. */
            return HTML5_Data::utf8chr($codepoint);
        }

    } else {
        /* Anything else */

        /* Consume the maximum number of characters possible,
        with the consumed characters matching one of the
        identifiers in the first column of the named character
        references table (in a case-sensitive manner). */

        // we will implement this by matching the longest
        // alphanumeric + semicolon string, and then working
        // our way backwards
        $chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1);
        $len = strlen($chars);

        $refs = HTML5_Data::getNamedCharacterReferences();
        $codepoint = false;
        // Try prefixes from longest to shortest; on a hit, $c is left
        // holding the length of the matched identifier and $id its text.
        for($c = $len; $c > 0; $c--) {
            $id = substr($chars, 0, $c);
            if(isset($refs[$id])) {
                $codepoint = $refs[$id];
                break;
            }
        }

        /* If no match can be made, then this is a parse error.
        No characters are consumed, and nothing is returned. */
        if (!$codepoint) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-named-entity'
            ));
            return '&' . $chars;
        }

        /* If the last character matched is not a U+003B SEMICOLON
        (;), there is a parse error. */
        $semicolon = true;
        if (substr($id, -1) !== ';') {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'named-entity-without-semicolon'
            ));
            $semicolon = false;
        }


        /* If the character reference is being consumed as part of
        an attribute, and the last character matched is not a
        U+003B SEMICOLON (;), and the next character is in the
        range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
        LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
        or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
        then, for historical reasons, all the characters that were
        matched after the U+0026 AMPERSAND (&) must be unconsumed,
        and nothing is returned. */
        // Only the character following the match inside $chars is
        // inspected here; the stream itself is not peeked.
        if (
            $inattr && !$semicolon &&
            strspn(substr($chars, $c, 1), self::ALPHA . self::DIGIT)
        ) {
            return '&' . $chars;
        }

        /* Otherwise, return a character token for the character
        corresponding to the character reference name (as given
        by the second column of the named character references table). */
        // Anything consumed beyond the matched identifier is appended
        // verbatim so no input is lost.
        return HTML5_Data::utf8chr($codepoint) . substr($chars, $c);
    }
}
2263
/**
 * Consumes a character reference inside an attribute value and appends
 * the outcome to the value of the last attribute in the current token.
 *
 * @param string|false $allowed The additional allowed character passed
 *                              through to consumeCharacterReference(),
 *                              or false when there is none.
 */
private function characterReferenceInAttributeValue($allowed = false) {
    /* Attempt to consume a character reference. */
    $entity = $this->consumeCharacterReference($allowed, true);

    /* If nothing is returned, append a U+0026 AMPERSAND
    character to the current attribute's value.

    Otherwise, append the returned character token to the
    current attribute's value. */
    // Test for "nothing" explicitly rather than by truthiness: the
    // string "0" is falsy in PHP, so a reference that decodes to the
    // digit zero (e.g. &#48;) would otherwise be replaced by '&'.
    $nothing = ($entity === false || $entity === null || $entity === '');
    $char = $nothing ? '&' : $entity;

    // The attribute currently being built is the last one in the list.
    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;

    /* Finally, switch back to the attribute value state that you
    were in when you were switched into this state. */
    // No state is changed in this method.
    // NOTE(review): presumably the caller restores the state; confirm.
}
2283
/**
 * Hands a token to the tree builder and then synchronises the
 * tokenizer's content model with it.
 *
 * @param array $token       Token to emit; its 'type' key is consulted
 *                           after the tree builder has processed it.
 * @param bool  $checkStream When true, errors queued on the input stream
 *                           are emitted (oldest first) before $token.
 */
protected function emitToken($token, $checkStream = true) {
    // Drain the stream's error queue first. Each queued error is emitted
    // with the check disabled so the nested call does not drain again.
    while ($checkStream && $this->stream->errors) {
        $queued = array_shift($this->stream->errors);
        $this->emitToken($queued, false);
    }

    // the current structure of attributes is not a terribly good one
    $this->tree->emitToken($token);

    // An integer content model left on the tree builder is adopted here
    // and then cleared on the tree builder.
    $requested = $this->tree->content_model;
    if (is_int($requested)) {
        $this->content_model = $requested;
        $this->tree->content_model = null;
        return;
    }

    // Otherwise an end tag resets the content model to PCDATA.
    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
2306 }
2307