5 Copyright 2007 Jeroen van der Meer <http://jero.net/>
6 Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
7 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files (the
11 "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 // /* */ indicates verbatim text from the HTML 5 specification
32 // // indicates regular comments
34 // all flags are in hyphenated form
36 class HTML5_Tokenizer {
38 * Points to an InputStream object.
43 * Tree builder that the tokenizer emits tokens to.
48 * Current content model we are parsing as.
50 protected $content_model;
53 * Current token that is being built, but not yet emitted. Also
54 * is the last token emitted, if applicable.
58 // These are constants describing the content model
64 // These are constants describing tokens
65 // XXX should probably be moved somewhere else, probably the
72 const SPACECHARACTER = 5;
76 // These are constants representing bunches of characters.
77 const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
78 const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
79 const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
80 const DIGIT = '0123456789';
81 const HEX = '0123456789ABCDEFabcdef';
82 const WHITESPACE = "\t\n\x0c ";
/**
 * Sets up the tokenizer over the given input.
 *
 * @param $data    Data to parse; it is wrapped in a new HTML5_InputStream.
 * @param $builder Optional tree builder that the tokenizer emits tokens
 *                 to. When omitted (or falsy), a new HTML5_TreeBuilder
 *                 is created instead.
 */
public function __construct($data, $builder = null) {
    $this->stream = new HTML5_InputStream($data);
    // Honour a caller-supplied builder. Previously it was silently
    // discarded, which left $this->tree unset whenever one was passed.
    $this->tree = $builder ? $builder : new HTML5_TreeBuilder;
    // Tokenizing always starts out in the PCDATA content model.
    $this->content_model = self::PCDATA;
}
93 public function parseFragment($context = null) {
94 $this->tree->setupContext($context);
95 if ($this->tree->content_model) {
96 $this->content_model = $this->tree->content_model;
97 $this->tree->content_model = null;
102 // XXX maybe convert this into an iterator? regardless, this function
103 // and the save function should go into a Parser facade of some sort
105 * Performs the actual parsing of the document.
107 public function parse() {
110 // This is used to avoid having to have look-behind in the data state.
113 * Escape flag as specified by the HTML5 specification: "used to
114 * control the behavior of the tokeniser. It is either true or
115 * false, and initially must be set to the false state."
119 while($state !== null) {
122 switch ($this->content_model) {
123 case self::PCDATA: echo 'PCDATA'; break;
124 case self::RCDATA: echo 'RCDATA'; break;
125 case self::CDATA: echo 'CDATA'; break;
126 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
128 if ($escape) echo " escape";
134 /* Consume the next input character */
135 $char = $this->stream->char();
136 $lastFourChars .= $char;
137 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
139 // see below for meaning
143 $this->content_model === self::RCDATA ||
144 $this->content_model === self::CDATA
149 $this->content_model === self::PCDATA ||
150 $this->content_model === self::RCDATA
153 $this->content_model === self::PCDATA ||
156 $this->content_model === self::RCDATA ||
157 $this->content_model === self::CDATA
164 $this->content_model === self::RCDATA ||
165 $this->content_model === self::CDATA
168 if($char === '&' && $amp_cond) {
169 /* U+0026 AMPERSAND (&)
170 When the content model flag is set to one of the PCDATA or RCDATA
171 states and the escape flag is false: switch to the
172 character reference data state. Otherwise: treat it as per
173 the "anything else" entry below. */
174 $state = 'characterReferenceData';
179 $lastFourChars === '<!--'
182 U+002D HYPHEN-MINUS (-)
183 If the content model flag is set to either the RCDATA state or
184 the CDATA state, and the escape flag is false, and there are at
185 least three characters before this one in the input stream, and the
186 last four characters in the input stream, including this one, are
187 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
188 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
191 /* In any case, emit the input character as a character token. Stay
192 in the data state. */
193 $this->emitToken(array(
194 'type' => self::CHARACTER,
197 // We do the "any case" part as part of "anything else".
199 /* U+003C LESS-THAN SIGN (<) */
200 } elseif($char === '<' && $lt_cond) {
201 /* When the content model flag is set to the PCDATA state: switch
202 to the tag open state.
204 When the content model flag is set to either the RCDATA state or
205 the CDATA state and the escape flag is false: switch to the tag
208 Otherwise: treat it as per the "anything else" entry below. */
211 /* U+003E GREATER-THAN SIGN (>) */
215 substr($lastFourChars, 1) === '-->'
217 /* If the content model flag is set to either the RCDATA state or
218 the CDATA state, and the escape flag is true, and the last three
219 characters in the input stream including this one are U+002D
220 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
221 set the escape flag to false. */
224 /* In any case, emit the input character as a character token.
225 Stay in the data state. */
226 $this->emitToken(array(
227 'type' => self::CHARACTER,
230 // We do the "any case" part as part of "anything else".
232 } elseif($char === false) {
234 Emit an end-of-file token. */
236 $this->tree->emitToken(array(
240 } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
241 // Directly after emitting a token you switch back to the "data
242 // state". At that point spaceCharacters are important so they are
243 // emitted separately.
244 $chars = $this->stream->charsWhile(self::WHITESPACE);
245 $this->emitToken(array(
246 'type' => self::SPACECHARACTER,
247 'data' => $char . $chars
249 $lastFourChars .= $chars;
250 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
254 THIS IS AN OPTIMIZATION: Get as many characters as would
255 otherwise also be treated as character tokens and emit them
256 as a single character token. Stay in the data state. */
259 if ($hyp_cond) $mask .= '-';
260 if ($amp_cond) $mask .= '&';
261 if ($lt_cond) $mask .= '<';
262 if ($gt_cond) $mask .= '>';
265 $chars = $this->stream->remainingChars();
267 $chars = $this->stream->charsUntil($mask);
270 $this->emitToken(array(
271 'type' => self::CHARACTER,
272 'data' => $char . $chars
275 $lastFourChars .= $chars;
276 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
282 case 'characterReferenceData':
283 /* (This cannot happen if the content model flag
284 is set to the CDATA state.) */
286 /* Attempt to consume a character reference, with no
287 additional allowed character. */
288 $entity = $this->consumeCharacterReference();
290 /* If nothing is returned, emit a U+0026 AMPERSAND
291 character token. Otherwise, emit the character token that
293 // This is all done when consuming the character reference.
294 $this->emitToken(array(
295 'type' => self::CHARACTER,
299 /* Finally, switch to the data state. */
304 $char = $this->stream->char();
306 switch($this->content_model) {
309 /* Consume the next input character. If it is a
310 U+002F SOLIDUS (/) character, switch to the close
311 tag open state. Otherwise, emit a U+003C LESS-THAN
312 SIGN character token and reconsume the current input
313 character in the data state. */
314 // We consumed above.
317 $state = 'closeTagOpen';
320 $this->emitToken(array(
321 'type' => self::CHARACTER,
325 $this->stream->unget();
332 /* If the content model flag is set to the PCDATA state
333 Consume the next input character: */
334 // We consumed above.
337 /* U+0021 EXCLAMATION MARK (!)
338 Switch to the markup declaration open state. */
339 $state = 'markupDeclarationOpen';
341 } elseif($char === '/') {
342 /* U+002F SOLIDUS (/)
343 Switch to the close tag open state. */
344 $state = 'closeTagOpen';
346 } elseif('A' <= $char && $char <= 'Z') {
347 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
348 Create a new start tag token, set its tag name to the lowercase
349 version of the input character (add 0x0020 to the character's code
350 point), then switch to the tag name state. (Don't emit the token
351 yet; further details will be filled in before it is emitted.) */
352 $this->token = array(
353 'name' => strtolower($char),
354 'type' => self::STARTTAG,
360 } elseif('a' <= $char && $char <= 'z') {
361 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
362 Create a new start tag token, set its tag name to the input
363 character, then switch to the tag name state. (Don't emit
364 the token yet; further details will be filled in before it
366 $this->token = array(
368 'type' => self::STARTTAG,
374 } elseif($char === '>') {
375 /* U+003E GREATER-THAN SIGN (>)
376 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
377 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
378 $this->emitToken(array(
379 'type' => self::PARSEERROR,
380 'data' => 'expected-tag-name-but-got-right-bracket'
382 $this->emitToken(array(
383 'type' => self::CHARACTER,
389 } elseif($char === '?') {
390 /* U+003F QUESTION MARK (?)
391 Parse error. Switch to the bogus comment state. */
392 $this->emitToken(array(
393 'type' => self::PARSEERROR,
394 'data' => 'expected-tag-name-but-got-question-mark'
396 $this->token = array(
398 'type' => self::COMMENT
400 $state = 'bogusComment';
404 Parse error. Emit a U+003C LESS-THAN SIGN character token and
405 reconsume the current input character in the data state. */
406 $this->emitToken(array(
407 'type' => self::PARSEERROR,
408 'data' => 'expected-tag-name'
410 $this->emitToken(array(
411 'type' => self::CHARACTER,
416 $this->stream->unget();
424 $this->content_model === self::RCDATA ||
425 $this->content_model === self::CDATA
427 /* If the content model flag is set to the RCDATA or CDATA
429 $name = strtolower($this->stream->charsWhile(self::ALPHA));
430 $following = $this->stream->char();
431 $this->stream->unget();
434 $this->token['name'] !== $name ||
435 $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
437 /* if no start tag token has ever been emitted by this instance
438 of the tokenizer (fragment case), or, if the next few
439 characters do not match the tag name of the last start tag
440 token emitted (compared in an ASCII case-insensitive manner),
441 or if they do but they are not immediately followed by one of
442 the following characters:
444 * U+0009 CHARACTER TABULATION
445 * U+000A LINE FEED (LF)
446 * U+000C FORM FEED (FF)
448 * U+003E GREATER-THAN SIGN (>)
452 ...then emit a U+003C LESS-THAN SIGN character token, a
453 U+002F SOLIDUS character token, and switch to the data
454 state to process the next input character. */
455 // XXX: Probably ought to replace in_array with $following === x ||...
457 // We also need to emit $name now we've consumed that, as we
458 // know it'll just be emitted as a character token.
459 $this->emitToken(array(
460 'type' => self::CHARACTER,
461 'data' => '</' . $name
466 // This matches what would happen if we actually did the
467 // otherwise below (but we can't because we've consumed too
470 // Start the end tag token with the name we already have.
471 $this->token = array(
473 'type' => self::ENDTAG
476 // Change to tag name state.
479 } elseif ($this->content_model === self::PCDATA) {
480 /* Otherwise, if the content model flag is set to the PCDATA
482 $char = $this->stream->char();
484 if ('A' <= $char && $char <= 'Z') {
485 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
486 Create a new end tag token, set its tag name to the lowercase version
487 of the input character (add 0x0020 to the character's code point), then
488 switch to the tag name state. (Don't emit the token yet; further details
489 will be filled in before it is emitted.) */
490 $this->token = array(
491 'name' => strtolower($char),
492 'type' => self::ENDTAG
497 } elseif ('a' <= $char && $char <= 'z') {
498 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
499 Create a new end tag token, set its tag name to the
500 input character, then switch to the tag name state.
501 (Don't emit the token yet; further details will be
502 filled in before it is emitted.) */
503 $this->token = array(
505 'type' => self::ENDTAG
510 } elseif($char === '>') {
511 /* U+003E GREATER-THAN SIGN (>)
512 Parse error. Switch to the data state. */
513 $this->emitToken(array(
514 'type' => self::PARSEERROR,
515 'data' => 'expected-closing-tag-but-got-right-bracket'
519 } elseif($char === false) {
521 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
522 SOLIDUS character token. Reconsume the EOF character in the data state. */
523 $this->emitToken(array(
524 'type' => self::PARSEERROR,
525 'data' => 'expected-closing-tag-but-got-eof'
527 $this->emitToken(array(
528 'type' => self::CHARACTER,
532 $this->stream->unget();
536 /* Parse error. Switch to the bogus comment state. */
537 $this->emitToken(array(
538 'type' => self::PARSEERROR,
539 'data' => 'expected-closing-tag-but-got-char'
541 $this->token = array(
543 'type' => self::COMMENT
545 $state = 'bogusComment';
551 /* Consume the next input character: */
552 $char = $this->stream->char();
554 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
555 /* U+0009 CHARACTER TABULATION
556 U+000A LINE FEED (LF)
557 U+000C FORM FEED (FF)
559 Switch to the before attribute name state. */
560 $state = 'beforeAttributeName';
562 } elseif($char === '/') {
563 /* U+002F SOLIDUS (/)
564 Switch to the self-closing start tag state. */
565 $state = 'selfClosingStartTag';
567 } elseif($char === '>') {
568 /* U+003E GREATER-THAN SIGN (>)
569 Emit the current tag token. Switch to the data state. */
570 $this->emitToken($this->token);
573 } elseif('A' <= $char && $char <= 'Z') {
574 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
575 Append the lowercase version of the current input
576 character (add 0x0020 to the character's code point) to
577 the current tag token's tag name. Stay in the tag name state. */
578 $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
580 $this->token['name'] .= strtolower($char . $chars);
583 } elseif($char === false) {
585 Parse error. Emit the current tag token. Reconsume the EOF
586 character in the data state. */
587 $this->emitToken(array(
588 'type' => self::PARSEERROR,
589 'data' => 'eof-in-tag-name'
591 $this->emitToken($this->token);
593 $this->stream->unget();
598 Append the current input character to the current tag token's tag name.
599 Stay in the tag name state. */
600 $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
602 $this->token['name'] .= $char . $chars;
607 case 'beforeAttributeName':
608 /* Consume the next input character: */
609 $char = $this->stream->char();
611 // this conditional is optimized, check bottom
612 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
613 /* U+0009 CHARACTER TABULATION
614 U+000A LINE FEED (LF)
615 U+000C FORM FEED (FF)
617 Stay in the before attribute name state. */
618 $state = 'beforeAttributeName';
620 } elseif($char === '/') {
621 /* U+002F SOLIDUS (/)
622 Switch to the self-closing start tag state. */
623 $state = 'selfClosingStartTag';
625 } elseif($char === '>') {
626 /* U+003E GREATER-THAN SIGN (>)
627 Emit the current tag token. Switch to the data state. */
628 $this->emitToken($this->token);
631 } elseif('A' <= $char && $char <= 'Z') {
632 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
633 Start a new attribute in the current tag token. Set that
634 attribute's name to the lowercase version of the current
635 input character (add 0x0020 to the character's code
636 point), and its value to the empty string. Switch to the
637 attribute name state.*/
638 $this->token['attr'][] = array(
639 'name' => strtolower($char),
643 $state = 'attributeName';
645 } elseif($char === false) {
647 Parse error. Emit the current tag token. Reconsume the EOF
648 character in the data state. */
649 $this->emitToken(array(
650 'type' => self::PARSEERROR,
651 'data' => 'expected-attribute-name-but-got-eof'
653 $this->emitToken($this->token);
655 $this->stream->unget();
659 /* U+0022 QUOTATION MARK (")
660 U+0027 APOSTROPHE (')
661 U+003D EQUALS SIGN (=)
662 Parse error. Treat it as per the "anything else" entry
664 if($char === '"' || $char === "'" || $char === '=') {
665 $this->emitToken(array(
666 'type' => self::PARSEERROR,
667 'data' => 'invalid-character-in-attribute-name'
672 Start a new attribute in the current tag token. Set that attribute's
673 name to the current input character, and its value to the empty string.
674 Switch to the attribute name state. */
675 $this->token['attr'][] = array(
680 $state = 'attributeName';
684 case 'attributeName':
685 // Consume the next input character:
686 $char = $this->stream->char();
688 // this conditional is optimized, check bottom
689 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
690 /* U+0009 CHARACTER TABULATION
691 U+000A LINE FEED (LF)
692 U+000C FORM FEED (FF)
694 Switch to the after attribute name state. */
695 $state = 'afterAttributeName';
697 } elseif($char === '/') {
698 /* U+002F SOLIDUS (/)
699 Switch to the self-closing start tag state. */
700 $state = 'selfClosingStartTag';
702 } elseif($char === '=') {
703 /* U+003D EQUALS SIGN (=)
704 Switch to the before attribute value state. */
705 $state = 'beforeAttributeValue';
707 } elseif($char === '>') {
708 /* U+003E GREATER-THAN SIGN (>)
709 Emit the current tag token. Switch to the data state. */
710 $this->emitToken($this->token);
713 } elseif('A' <= $char && $char <= 'Z') {
714 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
715 Append the lowercase version of the current input
716 character (add 0x0020 to the character's code point) to
717 the current attribute's name. Stay in the attribute name
719 $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
721 $last = count($this->token['attr']) - 1;
722 $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
724 $state = 'attributeName';
726 } elseif($char === false) {
728 Parse error. Emit the current tag token. Reconsume the EOF
729 character in the data state. */
730 $this->emitToken(array(
731 'type' => self::PARSEERROR,
732 'data' => 'eof-in-attribute-name'
734 $this->emitToken($this->token);
736 $this->stream->unget();
740 /* U+0022 QUOTATION MARK (")
741 U+0027 APOSTROPHE (')
742 Parse error. Treat it as per the "anything else"
744 if($char === '"' || $char === "'") {
745 $this->emitToken(array(
746 'type' => self::PARSEERROR,
747 'data' => 'invalid-character-in-attribute-name'
752 Append the current input character to the current attribute's name.
753 Stay in the attribute name state. */
754 $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
756 $last = count($this->token['attr']) - 1;
757 $this->token['attr'][$last]['name'] .= $char . $chars;
759 $state = 'attributeName';
762 /* When the user agent leaves the attribute name state
763 (and before emitting the tag token, if appropriate), the
764 complete attribute's name must be compared to the other
765 attributes on the same token; if there is already an
766 attribute on the token with the exact same name, then this
767 is a parse error and the new attribute must be dropped, along
768 with the value that gets associated with it (if any). */
769 // this might be implemented in the emitToken method
772 case 'afterAttributeName':
773 // Consume the next input character:
774 $char = $this->stream->char();
776 // this is an optimized conditional, check the bottom
777 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
778 /* U+0009 CHARACTER TABULATION
779 U+000A LINE FEED (LF)
780 U+000C FORM FEED (FF)
782 Stay in the after attribute name state. */
783 $state = 'afterAttributeName';
785 } elseif($char === '/') {
786 /* U+002F SOLIDUS (/)
787 Switch to the self-closing start tag state. */
788 $state = 'selfClosingStartTag';
790 } elseif($char === '=') {
791 /* U+003D EQUALS SIGN (=)
792 Switch to the before attribute value state. */
793 $state = 'beforeAttributeValue';
795 } elseif($char === '>') {
796 /* U+003E GREATER-THAN SIGN (>)
797 Emit the current tag token. Switch to the data state. */
798 $this->emitToken($this->token);
801 } elseif('A' <= $char && $char <= 'Z') {
802 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
803 Start a new attribute in the current tag token. Set that
804 attribute's name to the lowercase version of the current
805 input character (add 0x0020 to the character's code
806 point), and its value to the empty string. Switch to the
807 attribute name state. */
808 $this->token['attr'][] = array(
809 'name' => strtolower($char),
813 $state = 'attributeName';
815 } elseif($char === false) {
817 Parse error. Emit the current tag token. Reconsume the EOF
818 character in the data state. */
819 $this->emitToken(array(
820 'type' => self::PARSEERROR,
821 'data' => 'expected-end-of-tag-but-got-eof'
823 $this->emitToken($this->token);
825 $this->stream->unget();
829 /* U+0022 QUOTATION MARK (")
830 U+0027 APOSTROPHE (')
831 Parse error. Treat it as per the "anything else"
833 if($char === '"' || $char === "'") {
834 $this->emitToken(array(
835 'type' => self::PARSEERROR,
836 'data' => 'invalid-character-after-attribute-name'
841 Start a new attribute in the current tag token. Set that attribute's
842 name to the current input character, and its value to the empty string.
843 Switch to the attribute name state. */
844 $this->token['attr'][] = array(
849 $state = 'attributeName';
853 case 'beforeAttributeValue':
854 // Consume the next input character:
855 $char = $this->stream->char();
857 // this is an optimized conditional
858 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
859 /* U+0009 CHARACTER TABULATION
860 U+000A LINE FEED (LF)
861 U+000C FORM FEED (FF)
863 Stay in the before attribute value state. */
864 $state = 'beforeAttributeValue';
866 } elseif($char === '"') {
867 /* U+0022 QUOTATION MARK (")
868 Switch to the attribute value (double-quoted) state. */
869 $state = 'attributeValueDoubleQuoted';
871 } elseif($char === '&') {
872 /* U+0026 AMPERSAND (&)
873 Switch to the attribute value (unquoted) state and reconsume
874 this input character. */
875 $this->stream->unget();
876 $state = 'attributeValueUnquoted';
878 } elseif($char === '\'') {
879 /* U+0027 APOSTROPHE (')
880 Switch to the attribute value (single-quoted) state. */
881 $state = 'attributeValueSingleQuoted';
883 } elseif($char === '>') {
884 /* U+003E GREATER-THAN SIGN (>)
885 Parse error. Emit the current tag token. Switch to the data state. */
886 $this->emitToken(array(
887 'type' => self::PARSEERROR,
888 'data' => 'expected-attribute-value-but-got-right-bracket'
890 $this->emitToken($this->token);
893 } elseif($char === false) {
895 Parse error. Emit the current tag token. Reconsume
896 the character in the data state. */
897 $this->emitToken(array(
898 'type' => self::PARSEERROR,
899 'data' => 'expected-attribute-value-but-got-eof'
901 $this->emitToken($this->token);
902 $this->stream->unget();
906 /* U+003D EQUALS SIGN (=)
907 Parse error. Treat it as per the "anything else" entry below. */
909 $this->emitToken(array(
910 'type' => self::PARSEERROR,
911 'data' => 'equals-in-unquoted-attribute-value'
916 Append the current input character to the current attribute's value.
917 Switch to the attribute value (unquoted) state. */
918 $last = count($this->token['attr']) - 1;
919 $this->token['attr'][$last]['value'] .= $char;
921 $state = 'attributeValueUnquoted';
925 case 'attributeValueDoubleQuoted':
926 // Consume the next input character:
927 $char = $this->stream->char();
930 /* U+0022 QUOTATION MARK (")
931 Switch to the after attribute value (quoted) state. */
932 $state = 'afterAttributeValueQuoted';
934 } elseif($char === '&') {
935 /* U+0026 AMPERSAND (&)
936 Switch to the character reference in attribute value
937 state, with the additional allowed character
938 being U+0022 QUOTATION MARK ("). */
939 $this->characterReferenceInAttributeValue('"');
941 } elseif($char === false) {
943 Parse error. Emit the current tag token. Reconsume the character
944 in the data state. */
945 $this->emitToken(array(
946 'type' => self::PARSEERROR,
947 'data' => 'eof-in-attribute-value-double-quote'
949 $this->emitToken($this->token);
951 $this->stream->unget();
956 Append the current input character to the current attribute's value.
957 Stay in the attribute value (double-quoted) state. */
958 $chars = $this->stream->charsUntil('"&');
960 $last = count($this->token['attr']) - 1;
961 $this->token['attr'][$last]['value'] .= $char . $chars;
963 $state = 'attributeValueDoubleQuoted';
967 case 'attributeValueSingleQuoted':
968 // Consume the next input character:
969 $char = $this->stream->char();
972 /* U+0027 APOSTROPHE (')
973 Switch to the after attribute value (quoted) state. */
974 $state = 'afterAttributeValueQuoted';
976 } elseif($char === '&') {
977 /* U+0026 AMPERSAND (&)
978 Switch to the character reference in attribute value state. */
979 $this->characterReferenceInAttributeValue("'");
981 } elseif($char === false) {
983 Parse error. Emit the current tag token. Reconsume the character
984 in the data state. */
985 $this->emitToken(array(
986 'type' => self::PARSEERROR,
987 'data' => 'eof-in-attribute-value-single-quote'
989 $this->emitToken($this->token);
991 $this->stream->unget();
996 Append the current input character to the current attribute's value.
997 Stay in the attribute value (single-quoted) state. */
998 $chars = $this->stream->charsUntil("'&");
1000 $last = count($this->token['attr']) - 1;
1001 $this->token['attr'][$last]['value'] .= $char . $chars;
1003 $state = 'attributeValueSingleQuoted';
1007 case 'attributeValueUnquoted':
1008 // Consume the next input character:
1009 $char = $this->stream->char();
1011 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1012 /* U+0009 CHARACTER TABULATION
1013 U+000A LINE FEED (LF)
1014 U+000C FORM FEED (FF)
1016 Switch to the before attribute name state. */
1017 $state = 'beforeAttributeName';
1019 } elseif($char === '&') {
1020 /* U+0026 AMPERSAND (&)
1021 Switch to the character reference in attribute value state. */
1022 $this->characterReferenceInAttributeValue();
1024 } elseif($char === '>') {
1025 /* U+003E GREATER-THAN SIGN (>)
1026 Emit the current tag token. Switch to the data state. */
1027 $this->emitToken($this->token);
1030 } elseif ($char === false) {
1032 Parse error. Emit the current tag token. Reconsume
1033 the character in the data state. */
1034 $this->emitToken(array(
1035 'type' => self::PARSEERROR,
1036 'data' => 'eof-in-attribute-value-no-quotes'
1038 $this->emitToken($this->token);
1039 $this->stream->unget();
1043 /* U+0022 QUOTATION MARK (")
1044 U+0027 APOSTROPHE (')
1045 U+003D EQUALS SIGN (=)
1046 Parse error. Treat it as per the "anything else"
1048 if($char === '"' || $char === "'" || $char === '=') {
1049 $this->emitToken(array(
1050 'type' => self::PARSEERROR,
1051 'data' => 'unexpected-character-in-unquoted-attribute-value'
1056 Append the current input character to the current attribute's value.
1057 Stay in the attribute value (unquoted) state. */
1058 $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1060 $last = count($this->token['attr']) - 1;
1061 $this->token['attr'][$last]['value'] .= $char . $chars;
1063 $state = 'attributeValueUnquoted';
1067 case 'afterAttributeValueQuoted':
1068 /* Consume the next input character: */
1069 $char = $this->stream->char();
1071 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1072 /* U+0009 CHARACTER TABULATION
1073 U+000A LINE FEED (LF)
1074 U+000C FORM FEED (FF)
1076 Switch to the before attribute name state. */
1077 $state = 'beforeAttributeName';
1079 } elseif ($char === '/') {
1080 /* U+002F SOLIDUS (/)
1081 Switch to the self-closing start tag state. */
1082 $state = 'selfClosingStartTag';
1084 } elseif ($char === '>') {
1085 /* U+003E GREATER-THAN SIGN (>)
1086 Emit the current tag token. Switch to the data state. */
1087 $this->emitToken($this->token);
1090 } elseif ($char === false) {
1092 Parse error. Emit the current tag token. Reconsume the EOF
1093 character in the data state. */
1094 $this->emitToken(array(
1095 'type' => self::PARSEERROR,
1096 'data' => 'unexpected-EOF-after-attribute-value'
1098 $this->emitToken($this->token);
1099 $this->stream->unget();
1104 Parse error. Reconsume the character in the before attribute
1106 $this->emitToken(array(
1107 'type' => self::PARSEERROR,
1108 'data' => 'unexpected-character-after-attribute-value'
1110 $this->stream->unget();
1111 $state = 'beforeAttributeName';
1115 case 'selfClosingStartTag':
1116 /* Consume the next input character: */
1117 $char = $this->stream->char();
1119 if ($char === '>') {
1120 /* U+003E GREATER-THAN SIGN (>)
1121 Set the self-closing flag of the current tag token.
1122 Emit the current tag token. Switch to the data state. */
1123 // not sure if this is the name we want
1124 $this->token['self-closing'] = true;
1125 /* When an end tag token is emitted with its self-closing flag set,
1126 that is a parse error. */
1127 if ($this->token['type'] === self::ENDTAG) {
1128 $this->emitToken(array(
1129 'type' => self::PARSEERROR,
1130 'data' => 'self-closing-end-tag'
1133 $this->emitToken($this->token);
1136 } elseif ($char === false) {
1138 Parse error. Emit the current tag token. Reconsume the
1139 EOF character in the data state. */
1140 $this->emitToken(array(
1141 'type' => self::PARSEERROR,
1142 'data' => 'unexpected-eof-after-self-closing'
1144 $this->emitToken($this->token);
1145 $this->stream->unget();
1150 Parse error. Reconsume the character in the before attribute name state. */
1151 $this->emitToken(array(
1152 'type' => self::PARSEERROR,
1153 'data' => 'unexpected-character-after-self-closing'
1155 $this->stream->unget();
1156 $state = 'beforeAttributeName';
1160 case 'bogusComment':
1161 /* (This can only happen if the content model flag is set to the PCDATA state.) */
1162 /* Consume every character up to the first U+003E GREATER-THAN SIGN
1163 character (>) or the end of the file (EOF), whichever comes first. Emit
1164 a comment token whose data is the concatenation of all the characters
1165 starting from and including the character that caused the state machine
1166 to switch into the bogus comment state, up to and including the last
1167 consumed character before the U+003E character, if any, or up to the
1168 end of the file otherwise. (If the comment was started by the end of
1169 the file (EOF), the token is empty.) */
1170 $this->token['data'] .= (string) $this->stream->charsUntil('>');
1171 $this->stream->char();
1173 $this->emitToken($this->token);
1175 /* Switch to the data state. */
1179 case 'markupDeclarationOpen':
1180 // Consume for below
1181 $hyphens = $this->stream->charsWhile('-', 2);
1182 if ($hyphens === '-') {
1183 $this->stream->unget();
1185 if ($hyphens !== '--') {
1186 $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1189 /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1190 characters, consume those two characters, create a comment token whose
1191 data is the empty string, and switch to the comment state. */
1192 if($hyphens === '--') {
1193 $state = 'commentStart';
1194 $this->token = array(
1196 'type' => self::COMMENT
1199 /* Otherwise if the next seven characters are a case-insensitive match
1200 for the word "DOCTYPE", then consume those characters and switch to the
1202 } elseif(strtoupper($alpha) === 'DOCTYPE') {
1205 // XXX not implemented
1206 /* Otherwise, if the insertion mode is "in foreign content"
1207 and the current node is not an element in the HTML namespace
1208 and the next seven characters are an ASCII case-sensitive
1209 match for the string "[CDATA[" (the five uppercase letters
1210 "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1211 and after), then consume those characters and switch to the
1212 CDATA section state (which is unrelated to the content model
1213 flag's CDATA state). */
1215 /* Otherwise, this is a parse error. Switch to the bogus comment state.
1216 The next character that is consumed, if any, is the first character
1217 that will be in the comment. */
1219 $this->emitToken(array(
1220 'type' => self::PARSEERROR,
1221 'data' => 'expected-dashes-or-doctype'
1223 $this->token = array(
1224 'data' => (string) $alpha,
1225 'type' => self::COMMENT
1227 $state = 'bogusComment';
1231 case 'commentStart':
1232 /* Consume the next input character: */
1233 $char = $this->stream->char();
1235 if ($char === '-') {
1236 /* U+002D HYPHEN-MINUS (-)
1237 Switch to the comment start dash state. */
1238 $state = 'commentStartDash';
1239 } elseif ($char === '>') {
1240 /* U+003E GREATER-THAN SIGN (>)
1241 Parse error. Emit the comment token. Switch to the
1243 $this->emitToken(array(
1244 'type' => self::PARSEERROR,
1245 'data' => 'incorrect-comment'
1247 $this->emitToken($this->token);
1249 } elseif ($char === false) {
1251 Parse error. Emit the comment token. Reconsume the
1252 EOF character in the data state. */
1253 $this->emitToken(array(
1254 'type' => self::PARSEERROR,
1255 'data' => 'eof-in-comment'
1257 $this->emitToken($this->token);
1258 $this->stream->unget();
1262 Append the input character to the comment token's
1263 data. Switch to the comment state. */
1264 $this->token['data'] .= $char;
1269 case 'commentStartDash':
1270 /* Consume the next input character: */
1271 $char = $this->stream->char();
1272 if ($char === '-') {
1273 /* U+002D HYPHEN-MINUS (-)
1274 Switch to the comment end state */
1275 $state = 'commentEnd';
1276 } elseif ($char === '>') {
1277 /* U+003E GREATER-THAN SIGN (>)
1278 Parse error. Emit the comment token. Switch to the
1280 $this->emitToken(array(
1281 'type' => self::PARSEERROR,
1282 'data' => 'incorrect-comment'
1284 $this->emitToken($this->token);
1286 } elseif ($char === false) {
1287 /* Parse error. Emit the comment token. Reconsume the
1288 EOF character in the data state. */
1289 $this->emitToken(array(
1290 'type' => self::PARSEERROR,
1291 'data' => 'eof-in-comment'
1293 $this->emitToken($this->token);
1294 $this->stream->unget();
1297 $this->token['data'] .= '-' . $char;
1303 /* Consume the next input character: */
1304 $char = $this->stream->char();
1307 /* U+002D HYPHEN-MINUS (-)
1308 Switch to the comment end dash state */
1309 $state = 'commentEndDash';
1311 } elseif($char === false) {
1313 Parse error. Emit the comment token. Reconsume the EOF character
1314 in the data state. */
1315 $this->emitToken(array(
1316 'type' => self::PARSEERROR,
1317 'data' => 'eof-in-comment'
1319 $this->emitToken($this->token);
1320 $this->stream->unget();
1325 Append the input character to the comment token's data. Stay in
1326 the comment state. */
1327 $chars = $this->stream->charsUntil('-');
1329 $this->token['data'] .= $char . $chars;
1333 case 'commentEndDash':
1334 /* Consume the next input character: */
1335 $char = $this->stream->char();
1338 /* U+002D HYPHEN-MINUS (-)
1339 Switch to the comment end state */
1340 $state = 'commentEnd';
1342 } elseif($char === false) {
1344 Parse error. Emit the comment token. Reconsume the EOF character
1345 in the data state. */
1346 $this->emitToken(array(
1347 'type' => self::PARSEERROR,
1348 'data' => 'eof-in-comment-end-dash'
1350 $this->emitToken($this->token);
1351 $this->stream->unget();
1356 Append a U+002D HYPHEN-MINUS (-) character and the input
1357 character to the comment token's data. Switch to the comment state. */
1358 $this->token['data'] .= '-'.$char;
1364 /* Consume the next input character: */
1365 $char = $this->stream->char();
1368 /* U+003E GREATER-THAN SIGN (>)
1369 Emit the comment token. Switch to the data state. */
1370 $this->emitToken($this->token);
1373 } elseif($char === '-') {
1374 /* U+002D HYPHEN-MINUS (-)
1375 Parse error. Append a U+002D HYPHEN-MINUS (-) character
1376 to the comment token's data. Stay in the comment end
1378 $this->emitToken(array(
1379 'type' => self::PARSEERROR,
1380 'data' => 'unexpected-dash-after-double-dash-in-comment'
1382 $this->token['data'] .= '-';
1384 } elseif($char === false) {
1386 Parse error. Emit the comment token. Reconsume the
1387 EOF character in the data state. */
1388 $this->emitToken(array(
1389 'type' => self::PARSEERROR,
1390 'data' => 'eof-in-comment-double-dash'
1392 $this->emitToken($this->token);
1393 $this->stream->unget();
1398 Parse error. Append two U+002D HYPHEN-MINUS (-)
1399 characters and the input character to the comment token's
1400 data. Switch to the comment state. */
1401 $this->emitToken(array(
1402 'type' => self::PARSEERROR,
1403 'data' => 'unexpected-char-in-comment'
1405 $this->token['data'] .= '--'.$char;
1411 /* Consume the next input character: */
1412 $char = $this->stream->char();
1414 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1415 /* U+0009 CHARACTER TABULATION
1416 U+000A LINE FEED (LF)
1417 U+000C FORM FEED (FF)
1419 Switch to the before DOCTYPE name state. */
1420 $state = 'beforeDoctypeName';
1424 Parse error. Reconsume the current character in the
1425 before DOCTYPE name state. */
1426 $this->emitToken(array(
1427 'type' => self::PARSEERROR,
1428 'data' => 'need-space-after-doctype'
1430 $this->stream->unget();
1431 $state = 'beforeDoctypeName';
1435 case 'beforeDoctypeName':
1436 /* Consume the next input character: */
1437 $char = $this->stream->char();
1439 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1440 /* U+0009 CHARACTER TABULATION
1441 U+000A LINE FEED (LF)
1442 U+000C FORM FEED (FF)
1444 Stay in the before DOCTYPE name state. */
1446 } elseif($char === '>') {
1447 /* U+003E GREATER-THAN SIGN (>)
1448 Parse error. Create a new DOCTYPE token. Set its
1449 force-quirks flag to on. Emit the token. Switch to the
1451 $this->emitToken(array(
1452 'type' => self::PARSEERROR,
1453 'data' => 'expected-doctype-name-but-got-right-bracket'
1455 $this->emitToken(array(
1457 'type' => self::DOCTYPE,
1458 'force-quirks' => true,
1464 } elseif('A' <= $char && $char <= 'Z') {
1465 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1466 Create a new DOCTYPE token. Set the token's name to the
1467 lowercase version of the input character (add 0x0020 to
1468 the character's code point). Switch to the DOCTYPE name
1470 $this->token = array(
1471 'name' => strtolower($char),
1472 'type' => self::DOCTYPE,
1476 $state = 'doctypeName';
1478 } elseif($char === false) {
1480 Parse error. Create a new DOCTYPE token. Set its
1481 force-quirks flag to on. Emit the token. Reconsume the
1482 EOF character in the data state. */
1483 $this->emitToken(array(
1484 'type' => self::PARSEERROR,
1485 'data' => 'expected-doctype-name-but-got-eof'
1487 $this->emitToken(array(
1489 'type' => self::DOCTYPE,
1490 'force-quirks' => true,
1494 $this->stream->unget();
1499 Create a new DOCTYPE token. Set the token's name to the
1500 current input character. Switch to the DOCTYPE name state. */
1501 $this->token = array(
1503 'type' => self::DOCTYPE,
1507 $state = 'doctypeName';
1512 /* Consume the next input character: */
1513 $char = $this->stream->char();
1515 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1516 /* U+0009 CHARACTER TABULATION
1517 U+000A LINE FEED (LF)
1518 U+000C FORM FEED (FF)
1520 Switch to the after DOCTYPE name state. */
1521 $state = 'afterDoctypeName';
1523 } elseif($char === '>') {
1524 /* U+003E GREATER-THAN SIGN (>)
1525 Emit the current DOCTYPE token. Switch to the data state. */
1526 $this->emitToken($this->token);
1529 } elseif('A' <= $char && $char <= 'Z') {
1530 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1531 Append the lowercase version of the input character
1532 (add 0x0020 to the character's code point) to the current
1533 DOCTYPE token's name. Stay in the DOCTYPE name state. */
1534 $this->token['name'] .= strtolower($char);
1536 } elseif($char === false) {
1538 Parse error. Set the DOCTYPE token's force-quirks flag
1539 to on. Emit that DOCTYPE token. Reconsume the EOF
1540 character in the data state. */
1541 $this->emitToken(array(
1542 'type' => self::PARSEERROR,
1543 'data' => 'eof-in-doctype-name'
1545 $this->token['force-quirks'] = true;
1546 $this->emitToken($this->token);
1547 $this->stream->unget();
1552 Append the current input character to the current
1553 DOCTYPE token's name. Stay in the DOCTYPE name state. */
1554 $this->token['name'] .= $char;
1557 // XXX this is probably some sort of quirks mode designation,
1558 // check tree-builder to be sure. In general 'error' needs
1559 // to be specc'ified, this probably means removing it at the end
1560 $this->token['error'] = ($this->token['name'] === 'HTML')
1565 case 'afterDoctypeName':
1566 /* Consume the next input character: */
1567 $char = $this->stream->char();
1569 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1570 /* U+0009 CHARACTER TABULATION
1571 U+000A LINE FEED (LF)
1572 U+000C FORM FEED (FF)
1574 Stay in the after DOCTYPE name state. */
1576 } elseif($char === '>') {
1577 /* U+003E GREATER-THAN SIGN (>)
1578 Emit the current DOCTYPE token. Switch to the data state. */
1579 $this->emitToken($this->token);
1582 } elseif($char === false) {
1584 Parse error. Set the DOCTYPE token's force-quirks flag
1585 to on. Emit that DOCTYPE token. Reconsume the EOF
1586 character in the data state. */
1587 $this->emitToken(array(
1588 'type' => self::PARSEERROR,
1589 'data' => 'eof-in-doctype'
1591 $this->token['force-quirks'] = true;
1592 $this->emitToken($this->token);
1593 $this->stream->unget();
1599 $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1600 if ($nextSix === 'PUBLIC') {
1601 /* If the next six characters are an ASCII
1602 case-insensitive match for the word "PUBLIC", then
1603 consume those characters and switch to the before
1604 DOCTYPE public identifier state. */
1605 $state = 'beforeDoctypePublicIdentifier';
1607 } elseif ($nextSix === 'SYSTEM') {
1608 /* Otherwise, if the next six characters are an ASCII
1609 case-insensitive match for the word "SYSTEM", then
1610 consume those characters and switch to the before
1611 DOCTYPE system identifier state. */
1612 $state = 'beforeDoctypeSystemIdentifier';
1615 /* Otherwise, this is a parse error. Set the DOCTYPE
1616 token's force-quirks flag to on. Switch to the bogus
1618 $this->emitToken(array(
1619 'type' => self::PARSEERROR,
1620 'data' => 'expected-space-or-right-bracket-in-doctype'
1622 $this->token['force-quirks'] = true;
1623 $this->token['error'] = true;
1624 $state = 'bogusDoctype';
1629 case 'beforeDoctypePublicIdentifier':
1630 /* Consume the next input character: */
1631 $char = $this->stream->char();
1633 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1634 /* U+0009 CHARACTER TABULATION
1635 U+000A LINE FEED (LF)
1636 U+000C FORM FEED (FF)
1638 Stay in the before DOCTYPE public identifier state. */
1639 } elseif ($char === '"') {
1640 /* U+0022 QUOTATION MARK (")
1641 Set the DOCTYPE token's public identifier to the empty
1642 string (not missing), then switch to the DOCTYPE public
1643 identifier (double-quoted) state. */
1644 $this->token['public'] = '';
1645 $state = 'doctypePublicIdentifierDoubleQuoted';
1646 } elseif ($char === "'") {
1647 /* U+0027 APOSTROPHE (')
1648 Set the DOCTYPE token's public identifier to the empty
1649 string (not missing), then switch to the DOCTYPE public
1650 identifier (single-quoted) state. */
1651 $this->token['public'] = '';
1652 $state = 'doctypePublicIdentifierSingleQuoted';
1653 } elseif ($char === '>') {
1654 /* Parse error. Set the DOCTYPE token's force-quirks flag
1655 to on. Emit that DOCTYPE token. Switch to the data state. */
1656 $this->emitToken(array(
1657 'type' => self::PARSEERROR,
1658 'data' => 'unexpected-end-of-doctype'
1660 $this->token['force-quirks'] = true;
1661 $this->emitToken($this->token);
1663 } elseif ($char === false) {
1664 /* Parse error. Set the DOCTYPE token's force-quirks
1665 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1666 character in the data state. */
1667 $this->emitToken(array(
1668 'type' => self::PARSEERROR,
1669 'data' => 'eof-in-doctype'
1671 $this->token['force-quirks'] = true;
1672 $this->emitToken($this->token);
1673 $this->stream->unget();
1676 /* Parse error. Set the DOCTYPE token's force-quirks flag
1677 to on. Switch to the bogus DOCTYPE state. */
1678 $this->emitToken(array(
1679 'type' => self::PARSEERROR,
1680 'data' => 'unexpected-char-in-doctype'
1682 $this->token['force-quirks'] = true;
1683 $state = 'bogusDoctype';
1687 case 'doctypePublicIdentifierDoubleQuoted':
1688 /* Consume the next input character: */
1689 $char = $this->stream->char();
1691 if ($char === '"') {
1692 /* U+0022 QUOTATION MARK (")
1693 Switch to the after DOCTYPE public identifier state. */
1694 $state = 'afterDoctypePublicIdentifier';
1695 } elseif ($char === '>') {
1696 /* U+003E GREATER-THAN SIGN (>)
1697 Parse error. Set the DOCTYPE token's force-quirks flag
1698 to on. Emit that DOCTYPE token. Switch to the data state. */
1699 $this->emitToken(array(
1700 'type' => self::PARSEERROR,
1701 'data' => 'unexpected-end-of-doctype'
1703 $this->token['force-quirks'] = true;
1704 $this->emitToken($this->token);
1706 } elseif ($char === false) {
1708 Parse error. Set the DOCTYPE token's force-quirks flag
1709 to on. Emit that DOCTYPE token. Reconsume the EOF
1710 character in the data state. */
1711 $this->emitToken(array(
1712 'type' => self::PARSEERROR,
1713 'data' => 'eof-in-doctype'
1715 $this->token['force-quirks'] = true;
1716 $this->emitToken($this->token);
1717 $this->stream->unget();
1721 Append the current input character to the current
1722 DOCTYPE token's public identifier. Stay in the DOCTYPE
1723 public identifier (double-quoted) state. */
1724 $this->token['public'] .= $char;
1728 case 'doctypePublicIdentifierSingleQuoted':
1729 /* Consume the next input character: */
1730 $char = $this->stream->char();
1732 if ($char === "'") {
1733 /* U+0027 APOSTROPHE (')
1734 Switch to the after DOCTYPE public identifier state. */
1735 $state = 'afterDoctypePublicIdentifier';
1736 } elseif ($char === '>') {
1737 /* U+003E GREATER-THAN SIGN (>)
1738 Parse error. Set the DOCTYPE token's force-quirks flag
1739 to on. Emit that DOCTYPE token. Switch to the data state. */
1740 $this->emitToken(array(
1741 'type' => self::PARSEERROR,
1742 'data' => 'unexpected-end-of-doctype'
1744 $this->token['force-quirks'] = true;
1745 $this->emitToken($this->token);
1747 } elseif ($char === false) {
1749 Parse error. Set the DOCTYPE token's force-quirks flag
1750 to on. Emit that DOCTYPE token. Reconsume the EOF
1751 character in the data state. */
1752 $this->emitToken(array(
1753 'type' => self::PARSEERROR,
1754 'data' => 'eof-in-doctype'
1756 $this->token['force-quirks'] = true;
1757 $this->emitToken($this->token);
1758 $this->stream->unget();
1762 Append the current input character to the current
1763 DOCTYPE token's public identifier. Stay in the DOCTYPE
1764 public identifier (single-quoted) state. */
1765 $this->token['public'] .= $char;
1769 case 'afterDoctypePublicIdentifier':
1770 /* Consume the next input character: */
1771 $char = $this->stream->char();
1773 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1774 /* U+0009 CHARACTER TABULATION
1775 U+000A LINE FEED (LF)
1776 U+000C FORM FEED (FF)
1778 Stay in the after DOCTYPE public identifier state. */
1779 } elseif ($char === '"') {
1780 /* U+0022 QUOTATION MARK (")
1781 Set the DOCTYPE token's system identifier to the
1782 empty string (not missing), then switch to the DOCTYPE
1783 system identifier (double-quoted) state. */
1784 $this->token['system'] = '';
1785 $state = 'doctypeSystemIdentifierDoubleQuoted';
1786 } elseif ($char === "'") {
1787 /* U+0027 APOSTROPHE (')
1788 Set the DOCTYPE token's system identifier to the
1789 empty string (not missing), then switch to the DOCTYPE
1790 system identifier (single-quoted) state. */
1791 $this->token['system'] = '';
1792 $state = 'doctypeSystemIdentifierSingleQuoted';
1793 } elseif ($char === '>') {
1794 /* U+003E GREATER-THAN SIGN (>)
1795 Emit the current DOCTYPE token. Switch to the data state. */
1796 $this->emitToken($this->token);
1798 } elseif ($char === false) {
1799 /* Parse error. Set the DOCTYPE token's force-quirks
1800 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1801 character in the data state. */
1802 $this->emitToken(array(
1803 'type' => self::PARSEERROR,
1804 'data' => 'eof-in-doctype'
1806 $this->token['force-quirks'] = true;
1807 $this->emitToken($this->token);
1808 $this->stream->unget();
1812 Parse error. Set the DOCTYPE token's force-quirks flag
1813 to on. Switch to the bogus DOCTYPE state. */
1814 $this->emitToken(array(
1815 'type' => self::PARSEERROR,
1816 'data' => 'unexpected-char-in-doctype'
1818 $this->token['force-quirks'] = true;
1819 $state = 'bogusDoctype';
1823 case 'beforeDoctypeSystemIdentifier':
1824 /* Consume the next input character: */
1825 $char = $this->stream->char();
1827 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1828 /* U+0009 CHARACTER TABULATION
1829 U+000A LINE FEED (LF)
1830 U+000C FORM FEED (FF)
1832 Stay in the before DOCTYPE system identifier state. */
1833 } elseif ($char === '"') {
1834 /* U+0022 QUOTATION MARK (")
1835 Set the DOCTYPE token's system identifier to the empty
1836 string (not missing), then switch to the DOCTYPE system
1837 identifier (double-quoted) state. */
1838 $this->token['system'] = '';
1839 $state = 'doctypeSystemIdentifierDoubleQuoted';
1840 } elseif ($char === "'") {
1841 /* U+0027 APOSTROPHE (')
1842 Set the DOCTYPE token's system identifier to the empty
1843 string (not missing), then switch to the DOCTYPE system
1844 identifier (single-quoted) state. */
1845 $this->token['system'] = '';
1846 $state = 'doctypeSystemIdentifierSingleQuoted';
1847 } elseif ($char === '>') {
1848 /* Parse error. Set the DOCTYPE token's force-quirks flag
1849 to on. Emit that DOCTYPE token. Switch to the data state. */
1850 $this->emitToken(array(
1851 'type' => self::PARSEERROR,
1852 'data' => 'unexpected-char-in-doctype'
1854 $this->token['force-quirks'] = true;
1855 $this->emitToken($this->token);
1857 } elseif ($char === false) {
1858 /* Parse error. Set the DOCTYPE token's force-quirks
1859 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1860 character in the data state. */
1861 $this->emitToken(array(
1862 'type' => self::PARSEERROR,
1863 'data' => 'eof-in-doctype'
1865 $this->token['force-quirks'] = true;
1866 $this->emitToken($this->token);
1867 $this->stream->unget();
1870 /* Parse error. Set the DOCTYPE token's force-quirks flag
1871 to on. Switch to the bogus DOCTYPE state. */
1872 $this->emitToken(array(
1873 'type' => self::PARSEERROR,
1874 'data' => 'unexpected-char-in-doctype'
1876 $this->token['force-quirks'] = true;
1877 $state = 'bogusDoctype';
1881 case 'doctypeSystemIdentifierDoubleQuoted':
1882 /* Consume the next input character: */
1883 $char = $this->stream->char();
1885 if ($char === '"') {
1886 /* U+0022 QUOTATION MARK (")
1887 Switch to the after DOCTYPE system identifier state. */
1888 $state = 'afterDoctypeSystemIdentifier';
1889 } elseif ($char === '>') {
1890 /* U+003E GREATER-THAN SIGN (>)
1891 Parse error. Set the DOCTYPE token's force-quirks flag
1892 to on. Emit that DOCTYPE token. Switch to the data state. */
1893 $this->emitToken(array(
1894 'type' => self::PARSEERROR,
1895 'data' => 'unexpected-end-of-doctype'
1897 $this->token['force-quirks'] = true;
1898 $this->emitToken($this->token);
1900 } elseif ($char === false) {
1902 Parse error. Set the DOCTYPE token's force-quirks flag
1903 to on. Emit that DOCTYPE token. Reconsume the EOF
1904 character in the data state. */
1905 $this->emitToken(array(
1906 'type' => self::PARSEERROR,
1907 'data' => 'eof-in-doctype'
1909 $this->token['force-quirks'] = true;
1910 $this->emitToken($this->token);
1911 $this->stream->unget();
1915 Append the current input character to the current
1916 DOCTYPE token's system identifier. Stay in the DOCTYPE
1917 system identifier (double-quoted) state. */
1918 $this->token['system'] .= $char;
1922 case 'doctypeSystemIdentifierSingleQuoted':
1923 /* Consume the next input character: */
1924 $char = $this->stream->char();
1926 if ($char === "'") {
1927 /* U+0027 APOSTROPHE (')
1928 Switch to the after DOCTYPE system identifier state. */
1929 $state = 'afterDoctypeSystemIdentifier';
1930 } elseif ($char === '>') {
1931 /* U+003E GREATER-THAN SIGN (>)
1932 Parse error. Set the DOCTYPE token's force-quirks flag
1933 to on. Emit that DOCTYPE token. Switch to the data state. */
1934 $this->emitToken(array(
1935 'type' => self::PARSEERROR,
1936 'data' => 'unexpected-end-of-doctype'
1938 $this->token['force-quirks'] = true;
1939 $this->emitToken($this->token);
1941 } elseif ($char === false) {
1943 Parse error. Set the DOCTYPE token's force-quirks flag
1944 to on. Emit that DOCTYPE token. Reconsume the EOF
1945 character in the data state. */
1946 $this->emitToken(array(
1947 'type' => self::PARSEERROR,
1948 'data' => 'eof-in-doctype'
1950 $this->token['force-quirks'] = true;
1951 $this->emitToken($this->token);
1952 $this->stream->unget();
1956 Append the current input character to the current
1957 DOCTYPE token's system identifier. Stay in the DOCTYPE
1958 system identifier (single-quoted) state. */
1959 $this->token['system'] .= $char;
1963 case 'afterDoctypeSystemIdentifier':
1964 /* Consume the next input character: */
1965 $char = $this->stream->char();
1967 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1968 /* U+0009 CHARACTER TABULATION
1969 U+000A LINE FEED (LF)
1970 U+000C FORM FEED (FF)
1972 Stay in the after DOCTYPE system identifier state. */
1973 } elseif ($char === '>') {
1974 /* U+003E GREATER-THAN SIGN (>)
1975 Emit the current DOCTYPE token. Switch to the data state. */
1976 $this->emitToken($this->token);
1978 } elseif ($char === false) {
1979 /* Parse error. Set the DOCTYPE token's force-quirks
1980 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1981 character in the data state. */
1982 $this->emitToken(array(
1983 'type' => self::PARSEERROR,
1984 'data' => 'eof-in-doctype'
1986 $this->token['force-quirks'] = true;
1987 $this->emitToken($this->token);
1988 $this->stream->unget();
1992 Parse error. Switch to the bogus DOCTYPE state.
1993 (This does not set the DOCTYPE token's force-quirks
1995 $this->emitToken(array(
1996 'type' => self::PARSEERROR,
1997 'data' => 'unexpected-char-in-doctype'
1999 $state = 'bogusDoctype';
2003 case 'bogusDoctype':
2004 /* Consume the next input character: */
2005 $char = $this->stream->char();
2007 if ($char === '>') {
2008 /* U+003E GREATER-THAN SIGN (>)
2009 Emit the DOCTYPE token. Switch to the data state. */
2010 $this->emitToken($this->token);
2013 } elseif($char === false) {
2015 Emit the DOCTYPE token. Reconsume the EOF character in
2017 $this->emitToken($this->token);
2018 $this->stream->unget();
2023 Stay in the bogus DOCTYPE state. */
2027 // case 'cdataSection':
/**
 * Returns a serialized representation of the tree.
 *
 * Pure delegate: the value is whatever the tree builder's own save()
 * method produces.
 */
public function save() {
    $serialized = $this->tree->save();
    return $serialized;
}
/**
 * Returns the input stream.
 *
 * Hands back the same object held in $this->stream (no copy is made).
 */
public function stream() {
    $inputStream = $this->stream;
    return $inputStream;
}
/**
 * Attempts to consume a character reference from the input stream.
 *
 * The U+0026 AMPERSAND that introduces the reference has already been
 * consumed by the caller; this method reads what follows it.
 *
 * @param string|false $allowed The additional allowed character, or false
 *                              when there is none.
 * @param bool         $inattr  Whether the reference is being consumed as
 *                              part of an attribute value.
 * @return string The decoded character (followed by any characters that
 *                were read past the end of a named reference) on success;
 *                otherwise the literal text, starting with '&', that was
 *                consumed without forming a reference.
 */
private function consumeCharacterReference($allowed = false, $inattr = false) {
    // This goes quite far against spec, and is far closer to the Python
    // impl., mainly because we don't do the large unconsuming the spec
    // requires.

    // All consumed characters.
    $chars = $this->stream->char();

    /* This section defines how to consume a character
    reference. This definition is used when parsing character
    references in text and in attributes.

    The behavior depends on the identity of the next character
    (the one immediately after the U+0026 AMPERSAND character): */

    // EOF (false) is tested first so that $chars[0] is never evaluated
    // on a non-string value.
    if (
        $chars === false ||
        $chars[0] === "\x09" ||
        $chars[0] === "\x0A" ||
        $chars[0] === "\x0C" ||
        $chars[0] === "\x20" ||
        $chars[0] === '<' ||
        $chars[0] === '&' ||
        $chars[0] === $allowed
    ) {
        /* U+0009 CHARACTER TABULATION
           U+000A LINE FEED (LF)
           U+000C FORM FEED (FF)
           U+0020 SPACE
           U+003C LESS-THAN SIGN
           U+0026 AMPERSAND
           EOF
           The additional allowed character, if there is one
        Not a character reference. No characters are consumed,
        and nothing is returned. (This is not an error, either.) */
        // We already consumed, so unconsume.
        $this->stream->unget();
        return '&';
    }

    if ($chars[0] === '#') {
        /* Consume the U+0023 NUMBER SIGN. */
        // Um, yeah, we already did that.
        /* The behavior further depends on the character after
        the U+0023 NUMBER SIGN: */
        $chars .= $this->stream->char();
        if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
            /* U+0078 LATIN SMALL LETTER X
               U+0058 LATIN CAPITAL LETTER X */
            /* Consume the X. */
            // Um, yeah, we already did that.
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE, U+0061 LATIN SMALL LETTER A through to U+0066
            LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
            A, through to U+0046 LATIN CAPITAL LETTER F (in other
            words, 0123456789, ABCDEF, abcdef). */
            $char_class = self::HEX;
            /* When it comes to interpreting the
            number, interpret it as a hexadecimal number. */
            $hex = true;
        } else {
            /* Anything else */
            // Unconsume because we shouldn't have consumed this.
            $chars = $chars[0];
            $this->stream->unget();
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE (i.e. just 0123456789). */
            $char_class = self::DIGIT;
            /* When it comes to interpreting the number,
            interpret it as a decimal number. */
            $hex = false;
        }

        /* Consume as many characters as match the range of characters given above. */
        $consumed = $this->stream->charsWhile($char_class);
        if ($consumed === '' || $consumed === false) {
            /* If no characters match the range, then don't consume
            any characters (and unconsume the U+0023 NUMBER SIGN
            character and, if appropriate, the X character). This
            is a parse error; nothing is returned. */
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-numeric-entity'
            ));
            return '&' . $chars;
        }

        /* Otherwise, if the next character is a U+003B SEMICOLON,
        consume that too. If it isn't, there is a parse error. */
        if ($this->stream->char() !== ';') {
            $this->stream->unget();
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'numeric-entity-without-semicolon'
            ));
        }

        /* If one or more characters match the range, then take
        them all and interpret the string of characters as a number
        (either hexadecimal or decimal as appropriate). */
        $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

        // hexdec() returns a float for very long digit strings. Every
        // value above 0x10FFFF is handled identically below, so clamp to
        // a small int to keep the table lookup and the bitwise test on
        // integers.
        // NOTE(review): assumes getRealCodepoint() has no entries above
        // 0x10FFFF - confirm against HTML5_Data.
        if ($codepoint > 0x10FFFF) {
            $codepoint = 0x110000;
        }

        /* If that number is one of the numbers in the first column
        of the following table, then this is a parse error. Find the
        row with that number in the first column, and return a
        character token for the Unicode character given in the
        second column of that row. */
        $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
        if ($new_codepoint) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'illegal-windows-1252-entity'
            ));
            $codepoint = $new_codepoint;
        } elseif (
            /* Otherwise, if the number is in the range 0x0000 to 0x0008,
            U+000B, U+000E to 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF ,
            0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
            0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
            0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF,
            0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE,
            0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
            0x10FFFE, or 0x10FFFF, or is higher than 0x10FFFF, then this
            is a parse error; return a character token for the U+FFFD
            REPLACEMENT CHARACTER character instead. */
            // && has higher precedence than ||
            $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
            $codepoint === 0x000B ||
            $codepoint >= 0x000E && $codepoint <= 0x001F ||
            $codepoint >= 0x007F && $codepoint <= 0x009F ||
            $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
            $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
            ($codepoint & 0xFFFE) === 0xFFFE ||
            $codepoint > 0x10FFFF
        ) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'illegal-codepoint-for-numeric-entity'
            ));
            $codepoint = 0xFFFD;
        }

        /* Otherwise, return a character token for the Unicode
        character whose code point is that number. */
        return HTML5_Data::utf8chr($codepoint);
    }

    /* Anything else */

    /* Consume the maximum number of characters possible,
    with the consumed characters matching one of the
    identifiers in the first column of the named character
    references table (in a case-sensitive manner). */

    // we will implement this by matching the longest
    // alphanumeric + semicolon string, and then working
    // our way backwards
    $chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1);
    $len = strlen($chars);

    $refs = HTML5_Data::getNamedCharacterReferences();
    $codepoint = false;
    // Try the longest prefix first; on a match $c is the matched length
    // and $id the matched identifier.
    for ($c = $len; $c > 0; $c--) {
        $id = substr($chars, 0, $c);
        if (isset($refs[$id])) {
            $codepoint = $refs[$id];
            break;
        }
    }

    /* If no match can be made, then this is a parse error.
    No characters are consumed, and nothing is returned. */
    if (!$codepoint) {
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'expected-named-entity'
        ));
        return '&' . $chars;
    }

    /* If the last character matched is not a U+003B SEMICOLON
    (;), there is a parse error. */
    $semicolon = true;
    if (substr($id, -1) !== ';') {
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'named-entity-without-semicolon'
        ));
        $semicolon = false;
    }

    /* If the character reference is being consumed as part of
    an attribute, and the last character matched is not a
    U+003B SEMICOLON (;), and the next character is in the
    range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
    LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
    or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
    then, for historical reasons, all the characters that were
    matched after the U+0026 AMPERSAND (&) must be unconsumed,
    and nothing is returned. */
    // The "next character" is looked up in the text already read into
    // $chars; the cast covers substr() returning false at end of string.
    if (
        $inattr && !$semicolon &&
        strspn((string) substr($chars, $c, 1), self::ALPHA . self::DIGIT)
    ) {
        return '&' . $chars;
    }

    /* Otherwise, return a character token for the character
    corresponding to the character reference name (as given
    by the second column of the named character references table). */
    // Characters read beyond the matched name are handed back appended
    // to the decoded character.
    return HTML5_Data::utf8chr($codepoint) . substr($chars, $c);
}
/**
 * Consumes a character reference inside an attribute value and appends
 * the result to the value of the last attribute of the current token.
 *
 * @param string|false $allowed The additional allowed character passed
 *                              through to consumeCharacterReference(),
 *                              or false when there is none.
 * @return void
 */
private function characterReferenceInAttributeValue($allowed = false) {
    /* Attempt to consume a character reference. */
    $entity = $this->consumeCharacterReference($allowed, true);

    /* If nothing is returned, append a U+0026 AMPERSAND
    character to the current attribute's value.

    Otherwise, append the returned character token to the
    current attribute's value. */
    // "Nothing" is tested strictly: a loose truthiness test would also
    // reject the string "0" (what a reference such as &#48; decodes to)
    // and wrongly append '&' in its place.
    $char = ($entity === false || $entity === null || $entity === '')
        ? '&'
        : $entity;

    // The attribute currently being built is the last one on the token.
    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;

    /* Finally, switch back to the attribute value state that you
    were in when you were switched into this state. */
    // No state is changed in this method; presumably the caller restores
    // it - confirm at the call sites.
}
2285 * Emits a token, passing it on to the tree builder.
2287 protected function emitToken($token, $checkStream = true) {
2289 // Emit errors from input stream.
2290 while ($this->stream->errors) {
2291 $this->emitToken(array_shift($this->stream->errors), false);
2295 // the current structure of attributes is not a terribly good one
2296 $this->tree->emitToken($token);
2298 if(is_int($this->tree->content_model)) {
2299 $this->content_model = $this->tree->content_model;
2300 $this->tree->content_model = null;
2302 } elseif($token['type'] === self::ENDTAG) {
2303 $this->content_model = self::PCDATA;