[ Index ] |
PHP Cross Reference of vtigercrm-6.1.0 |
[Summary view] [Print] [Text view]
<?php

/**
 * Experimental HTML5-based lexer built on Jeroen van der Meer's PH5P library.
 * PH5P occupies the HTML5 pseudo-namespace, which may cause conflicts.
 *
 * @note
 *    Recent changes to PHP's DOM extension have resulted in some fatal
 *    error conditions with the original version of PH5P. Pending changes,
 *    this lexer punts to DirectLex if DOM throws an exception.
 */

class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {

    /**
     * Tokenizes HTML by running it through the PH5P HTML5 parser and then
     * walking the resulting DOM with tokenizeDOM().
     *
     * The input is first passed through normalize() and wrapHTML(). When
     * the parser raises a DOMException, the exception is registered in the
     * context under 'PH5PError' and the original (un-normalized) HTML is
     * handed to HTMLPurifier_Lexer_DirectLex instead.
     *
     * @param string $html    markup to tokenize
     * @param mixed  $config  forwarded unchanged to the helper calls
     * @param mixed  $context forwarded unchanged; receives 'PH5PError' on fallback
     * @return mixed the $tokens array after tokenizeDOM(), or the DirectLex result
     */
    public function tokenizeHTML($html, $config, $context) {
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context
        );

        try {
            $html5 = new HTML5($prepared);
            $doc = $html5->save();
        } catch (DOMException $e) {
            // Uh oh, it failed. Punt to DirectLex.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $e); // save the error, so we can detect it
            return $fallback->tokenizeHTML($html, $config, $context); // use original HTML
        }

        // Descend <html> -> <body> -> first <div>; that <div> is the node
        // handed to tokenizeDOM().
        $htmlNode = $doc->getElementsByTagName('html')->item(0);
        $bodyNode = $htmlNode->getElementsByTagName('body')->item(0);
        $wrapper  = $bodyNode->getElementsByTagName('div')->item(0);

        $tokens = array();
        $this->tokenizeDOM($wrapper, $tokens);
        return $tokens;
    }

}

/*

Copyright 2007 Jeroen van der Meer <http://jero.net/>

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
52 53 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 54 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 55 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 56 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 57 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 58 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 59 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 60 61 */ 62 63 class HTML5 { 64 private $data; 65 private $char; 66 private $EOF; 67 private $state; 68 private $tree; 69 private $token; 70 private $content_model; 71 private $escape = false; 72 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute', 73 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;', 74 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;', 75 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;', 76 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;', 77 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;', 78 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;', 79 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;', 80 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;', 81 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN', 82 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;', 83 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;', 84 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig', 85 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;', 86 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;', 87 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil', 88 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;', 89 
'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;', 90 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;', 91 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth', 92 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12', 93 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt', 94 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc', 95 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;', 96 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;', 97 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;', 98 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro', 99 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;', 100 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;', 101 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;', 102 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash', 103 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;', 104 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;', 105 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;', 106 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;', 107 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;', 108 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;', 109 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;', 110 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;', 111 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc', 112 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;', 113 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;'); 114 115 const PCDATA = 0; 116 const RCDATA = 
1;
    const CDATA     = 2;
    const PLAINTEXT = 3;

    // Values used for the 'type' key of emitted tokens.
    const DOCTYPE  = 0;
    const STARTTAG = 1;
    const ENDTAG   = 2;
    const COMMENT  = 3;
    const CHARACTR = 4;
    const EOF      = 5;

    /**
     * Stores the input, initialises the tokenizer fields and runs the state
     * machine until a state handler sets $this->state to null.
     *
     * @param string $data markup to tokenize
     */
    public function __construct($data) {
        // CRLF becomes LF; any remaining bare CR is dropped. The replacement
        // is '' rather than null because passing null to str_replace() is
        // deprecated as of PHP 8.1 (the resulting string is the same).
        $data = str_replace("\r\n", "\n", $data);
        $data = str_replace("\r", '', $data);

        $this->data = $data;
        $this->char = -1;
        $this->EOF = strlen($data);
        $this->tree = new HTML5TreeConstructer;
        $this->content_model = self::PCDATA;

        $this->state = 'data';

        // Each state name maps onto a "<name>State" method of this class.
        while($this->state !== null) {
            $this->{$this->state.'State'}();
        }
    }

    /**
     * Returns whatever the tree constructor's save() returns.
     */
    public function save() {
        return $this->tree->save();
    }

    /**
     * Returns the byte at the current position, or false once the position
     * has reached the end of the input.
     */
    private function char() {
        return ($this->char < $this->EOF)
            ? $this->data[$this->char]
            : false;
    }

    /**
     * Reads from an arbitrary position without moving the current position.
     *
     * @param int $s start offset
     * @param int $l number of bytes; 0 means "the single byte at $s"
     * @return string|null the byte or substring, or null (implicitly) when
     *                     $s + $l is not below the input length
     */
    private function character($s, $l = 0) {
        if($s + $l < $this->EOF) {
            if($l === 0) {
                return $this->data[$s];
            } else {
                return substr($this->data, $s, $l);
            }
        }
    }

    /**
     * Returns the longest run of bytes, starting at $start, that belong to
     * the given regex character class.
     *
     * When the byte at $start is not in the class (or $start is at/after the
     * end of the input) the result is the empty string. The previous
     * preg_replace() form returned the whole remaining input in that case,
     * because an unmatched pattern leaves the subject untouched.
     *
     * @param string $char_class contents of a PCRE character class, e.g. 'A-Za-z' or '^>'
     * @param int    $start      offset to start reading from
     * @return string the matching run, possibly empty
     */
    private function characters($char_class, $start) {
        $rest = substr($this->data, $start);

        if(!is_string($rest) || !preg_match('#^['.$char_class.']+#', $rest, $run)) {
            return '';
        }

        return $run[0];
    }

    /**
     * Data state: consumes one character and dispatches on it.
     */
    private function dataState() {
        // Consume the next input character
        $this->char++;
        $char = $this->char();

        if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
            /* U+0026 AMPERSAND (&)
            When the content model flag is set to one of the PCDATA or RCDATA
            states: switch to the entity data state. Otherwise: treat it as per
            the "anything else" entry below.
*/
            $this->state = 'entityData';

        } elseif($char === '-') {
            /* If the content model flag is set to either the RCDATA state or
            the CDATA state, and the escape flag is false, and the last four
            characters in the input stream, including this one, are "<!--",
            then set the escape flag to true.

            The window starts three bytes back so that it ENDS on the current
            character; starting four bytes back would exclude the current
            character and only ever match on "<!---". */
            if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA)
            && $this->escape === false
            && $this->char >= 3
            && substr($this->data, $this->char - 3, 4) === '<!--') {
                $this->escape = true;
            }

            /* In any case, emit the input character as a character token. Stay
            in the data state. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => $char
            ));

        /* U+003C LESS-THAN SIGN (<) */
        } elseif($char === '<' && ($this->content_model === self::PCDATA ||
        (($this->content_model === self::RCDATA ||
        $this->content_model === self::CDATA) && $this->escape === false))) {
            /* PCDATA, or RCDATA/CDATA with the escape flag unset: switch to
            the tag open state. Any other combination falls through to the
            branches below. */
            $this->state = 'tagOpen';

        /* U+003E GREATER-THAN SIGN (>) */
        } elseif($char === '>') {
            /* If the content model flag is set to either the RCDATA state or
            the CDATA state, and the escape flag is true, and the last three
            characters in the input stream including this one are "-->",
            set the escape flag to false.

            The window starts two bytes back so that it ends on this ">"; a
            window starting at ">" itself could never equal "-->". */
            if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA)
            && $this->escape === true
            && $this->char >= 2
            && substr($this->data, $this->char - 2, 3) === '-->') {
                $this->escape = false;
            }

            /* In any case, emit the input character as a character token.
            Stay in the data state. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => $char
            ));

        } elseif($this->char === $this->EOF) {
            /* EOF
            Emit an end-of-file token. */
            $this->EOF();

        } elseif($this->content_model === self::PLAINTEXT) {
            /* When the content model flag is set to the PLAINTEXT state
            THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
            the text and emit it as a character token. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            ));

            $this->EOF();

        } else {
            /* Anything else
            THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
            otherwise would also be treated as a character token and emit them
            as a single character token. Stay in the data state. */
            $len = strcspn($this->data, '<&', $this->char);
            $char = substr($this->data, $this->char, $len);
            // Leave the position on the last byte of the run; the next pass
            // through this state increments before reading.
            $this->char += $len - 1;

            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => $char
            ));

            $this->state = 'data';
        }
    }

    /**
     * Entity data state: emits either the result of entity() or a literal
     * ampersand as a character token.
     */
    private function entityDataState() {
        // Attempt to consume an entity.
        $entity = $this->entity();

        // If nothing is returned, emit a U+0026 AMPERSAND character token.
        // Otherwise, emit the character token that was returned.
        $char = (!$entity) ? '&' : $entity;
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        // Finally, switch to the data state.
$this->state = 'data';
    }

    /**
     * Tag open state: decides what follows a "<", depending on the content
     * model flag.
     */
    private function tagOpenState() {
        switch($this->content_model) {
            case self::RCDATA:
            case self::CDATA:
                /* A following U+002F SOLIDUS (/) is consumed and leads to the
                close tag open state. Anything else: emit a U+003C LESS-THAN
                SIGN character token and let the data state process the next
                input character. */
                if($this->character($this->char + 1) === '/') {
                    $this->char++;
                    $this->state = 'closeTagOpen';

                } else {
                    $this->emitToken(array(
                        'type' => self::CHARACTR,
                        'data' => '<'
                    ));

                    $this->state = 'data';
                }
            break;

            case self::PCDATA:
                // PCDATA content model: consume the next input character.
                $this->char++;
                $char = $this->char();

                if($char === '!') {
                    /* U+0021 EXCLAMATION MARK (!)
                    Switch to the markup declaration open state. */
                    $this->state = 'markupDeclarationOpen';

                } elseif($char === '/') {
                    /* U+002F SOLIDUS (/)
                    Switch to the close tag open state. */
                    $this->state = 'closeTagOpen';

                } elseif(preg_match('/^[A-Za-z]$/', $char)) {
                    /* ASCII letter
                    Start a new start tag token whose name is the lowercased
                    character, then switch to the tag name state. The token
                    is not emitted yet; later states fill in the details. */
                    $this->token = array(
                        'name' => strtolower($char),
                        'type' => self::STARTTAG,
                        'attr' => array()
                    );

                    $this->state = 'tagName';

                } elseif($char === '>') {
                    /* U+003E GREATER-THAN SIGN (>)
                    Parse error. Emit a U+003C LESS-THAN SIGN character token and a
                    U+003E GREATER-THAN SIGN character token. Switch to the data state.
*/
                    $this->emitToken(array(
                        'type' => self::CHARACTR,
                        'data' => '<>'
                    ));

                    $this->state = 'data';

                } elseif($char === '?') {
                    /* U+003F QUESTION MARK (?)
                    Parse error. Switch to the bogus comment state. */
                    $this->state = 'bogusComment';

                } else {
                    /* Anything else
                    Parse error. Emit a U+003C LESS-THAN SIGN character token
                    and reconsume the current input character in the data
                    state. */
                    $this->emitToken(array(
                        'type' => self::CHARACTR,
                        'data' => '<'
                    ));

                    $this->char--;
                    $this->state = 'data';
                }
            break;
        }
    }

    /**
     * Close tag open state: handles the input following "</".
     */
    private function closeTagOpenState() {
        // Lowercased run of ASCII letters after the current position, and
        // whether it equals the nodeName of the last entry on the tree's stack.
        $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
        $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;

        if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
        (!$the_same ||
        !preg_match('/[\t\n\x0b\x0c >\/]/', $this->character($this->char + 1 + strlen($next_node))) ||
        $this->EOF === $this->char)) {
            /* If the content model flag is set to the RCDATA or CDATA states then
            examine the next few characters. If they do not match the tag name of
            the last start tag token emitted (case insensitively), or if they do but
            they are not immediately followed by one of the following characters:
                * U+0009 CHARACTER TABULATION
                * U+000A LINE FEED (LF)
                * U+000B LINE TABULATION
                * U+000C FORM FEED (FF)
                * U+0020 SPACE
                * U+003E GREATER-THAN SIGN (>)
                * U+002F SOLIDUS (/)
                * EOF
            ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
            token, a U+002F SOLIDUS character token, and switch to the data state
            to process the next input character.
*/
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => '</'
            ));

            $this->state = 'data';

        } else {
            /* Otherwise (PCDATA content model, or the next few characters do
            match that tag name), consume the next input character: */
            $this->char++;
            $char = $this->char();

            if(preg_match('/^[A-Za-z]$/', $char)) {
                /* ASCII letter
                Start a new end tag token whose name is the lowercased
                character, then switch to the tag name state. The token is
                not emitted yet; later states fill in the details. */
                $this->token = array(
                    'name' => strtolower($char),
                    'type' => self::ENDTAG
                );

                $this->state = 'tagName';

            } elseif($char === '>') {
                /* U+003E GREATER-THAN SIGN (>)
                Parse error. Switch to the data state. */
                $this->state = 'data';

            } elseif($this->char === $this->EOF) {
                /* EOF
                Parse error. Emit a U+003C LESS-THAN SIGN character token and
                a U+002F SOLIDUS character token. Reconsume the EOF character
                in the data state. */
                $this->emitToken(array(
                    'type' => self::CHARACTR,
                    'data' => '</'
                ));

                $this->char--;
                $this->state = 'data';

            } else {
                /* Parse error. Switch to the bogus comment state. */
                $this->state = 'bogusComment';
            }
        }
    }

    /**
     * Tag name state: extends the current tag token's name one character at
     * a time until a delimiter is seen.
     */
    private function tagNameState() {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            /* U+0009 CHARACTER TABULATION
            U+000A LINE FEED (LF)
            U+000B LINE TABULATION
            U+000C FORM FEED (FF)
            U+0020 SPACE
            Switch to the before attribute name state.
*/
            $this->state = 'beforeAttributeName';

        } elseif($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Emit the current tag token. Switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the EOF
            character in the data state. */
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } elseif($char === '/') {
            /* U+002F SOLIDUS (/)
            Parse error unless this is a permitted slash. Switch to the
            before attribute name state. */
            $this->state = 'beforeAttributeName';

        } else {
            /* Anything else
            Append the lowercased input character to the current tag token's
            tag name. Stay in the tag name state. */
            $this->token['name'] .= strtolower($char);
            $this->state = 'tagName';
        }
    }

    /**
     * Before attribute name state: skips whitespace and slashes inside a tag
     * and opens a new attribute on anything else.
     */
    private function beforeAttributeNameState() {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            /* Tab, LF, VT, FF or space
            Stay in the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Emit the current tag token. Switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($char === '/') {
            /* U+002F SOLIDUS (/)
            Parse error unless this is a permitted slash. Stay in the before
            attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the EOF
            character in the data state.
*/
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } else {
            /* Anything else
            Start a new attribute in the current tag token, named after the
            lowercased input character and with a null value. Switch to the
            attribute name state. */
            $this->token['attr'][] = array(
                'name' => strtolower($char),
                'value' => null
            );

            $this->state = 'attributeName';
        }
    }

    /**
     * Attribute name state: extends the name of the most recently added
     * attribute until a delimiter is seen.
     */
    private function attributeNameState() {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            /* Tab, LF, VT, FF or space
            Switch to the after attribute name state. */
            $this->state = 'afterAttributeName';

        } elseif($char === '=') {
            /* U+003D EQUALS SIGN (=)
            Switch to the before attribute value state. */
            $this->state = 'beforeAttributeValue';

        } elseif($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Emit the current tag token. Switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
            /* U+002F SOLIDUS (/) not directly followed by ">"
            Parse error unless this is a permitted slash. Switch to the
            before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the EOF
            character in the data state. */
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } else {
            /* Anything else
            Append the current input character to the current attribute's name.
            Stay in the attribute name state.
*/
            $last = count($this->token['attr']) - 1;
            $this->token['attr'][$last]['name'] .= strtolower($char);

            $this->state = 'attributeName';
        }
    }

    /**
     * After attribute name state: handles what follows whitespace after an
     * attribute name.
     */
    private function afterAttributeNameState() {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            /* Tab, LF, VT, FF or space
            Stay in the after attribute name state. */
            $this->state = 'afterAttributeName';

        } elseif($char === '=') {
            /* U+003D EQUALS SIGN (=)
            Switch to the before attribute value state. */
            $this->state = 'beforeAttributeValue';

        } elseif($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Emit the current tag token. Switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
            /* U+002F SOLIDUS (/) not directly followed by ">"
            Parse error unless this is a permitted slash. Switch to the
            before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the EOF
            character in the data state. */
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } else {
            /* Anything else
            Start a new attribute in the current tag token. Set that attribute's
            name to the current input character, and its value to the empty string.
            Switch to the attribute name state.
*/
            $this->token['attr'][] = array(
                'name' => strtolower($char),
                'value' => null
            );

            $this->state = 'attributeName';
        }
    }

    /**
     * Before attribute value state: picks the quoted or unquoted value state
     * based on the first non-whitespace character after "=".
     */
    private function beforeAttributeValueState() {
        // Consume the next input character:
        $this->char++;
        $current = $this->character($this->char);

        // Tab, LF, VT, FF or space: stay in this state.
        if(preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
            $this->state = 'beforeAttributeValue';
            return;
        }

        // U+0022 QUOTATION MARK ("): double-quoted value follows.
        if($current === '"') {
            $this->state = 'attributeValueDoubleQuoted';
            return;
        }

        // U+0026 AMPERSAND (&): reconsume it in the unquoted value state.
        if($current === '&') {
            $this->char--;
            $this->state = 'attributeValueUnquoted';
            return;
        }

        // U+0027 APOSTROPHE ('): single-quoted value follows.
        if($current === '\'') {
            $this->state = 'attributeValueSingleQuoted';
            return;
        }

        // U+003E GREATER-THAN SIGN (>): emit the tag token, back to data.
        if($current === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';
            return;
        }

        // Anything else starts an unquoted value with this character.
        $newest = count($this->token['attr']) - 1;
        $this->token['attr'][$newest]['value'] .= $current;

        $this->state = 'attributeValueUnquoted';
    }

    /**
     * Attribute value (double-quoted) state.
     */
    private function attributeValueDoubleQuotedState() {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if($char === '"') {
            /* U+0022 QUOTATION MARK (")
            Switch to the before attribute name state.
*/
            $this->state = 'beforeAttributeName';

        } elseif($char === '&') {
            /* U+0026 AMPERSAND (&)
            Run the entity in attribute value handler. It is called directly
            rather than via $this->state, so this state stays current. */
            $this->entityInAttributeValueState('double');

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the character
            in the data state. */
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } else {
            /* Anything else
            Append the current input character to the current attribute's
            value. Stay in the attribute value (double-quoted) state. */
            $last = count($this->token['attr']) - 1;
            $this->token['attr'][$last]['value'] .= $char;

            $this->state = 'attributeValueDoubleQuoted';
        }
    }

    /**
     * Attribute value (single-quoted) state.
     */
    private function attributeValueSingleQuotedState() {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if($char === '\'') {
            /* U+0027 APOSTROPHE (')
            Switch to the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($char === '&') {
            /* U+0026 AMPERSAND (&)
            Run the entity in attribute value handler. It is called directly
            rather than via $this->state, so this state stays current. */
            $this->entityInAttributeValueState('single');

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the character
            in the data state. */
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } else {
            /* Anything else
            Append the current input character to the current attribute's value.
            Stay in the attribute value (single-quoted) state.
*/
            $last = count($this->token['attr']) - 1;
            $this->token['attr'][$last]['value'] .= $char;

            $this->state = 'attributeValueSingleQuoted';
        }
    }

    /**
     * Attribute value (unquoted) state.
     */
    private function attributeValueUnquotedState() {
        // Consume the next input character:
        $this->char++;
        $current = $this->character($this->char);

        // Tab, LF, VT, FF or space ends the value.
        if(preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
            $this->state = 'beforeAttributeName';
            return;
        }

        // U+0026 AMPERSAND (&): resolve an entity into the value. The
        // handler is called directly, so this state stays current.
        if($current === '&') {
            $this->entityInAttributeValueState();
            return;
        }

        // U+003E GREATER-THAN SIGN (>): emit the tag token, back to data.
        if($current === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';
            return;
        }

        // Anything else is appended to the value of the newest attribute.
        $newest = count($this->token['attr']) - 1;
        $this->token['attr'][$newest]['value'] .= $current;

        $this->state = 'attributeValueUnquoted';
    }

    /**
     * Appends either the result of entity() or a literal "&" to the value of
     * the most recently added attribute. Takes no parameters; any argument
     * passed by a caller is ignored. Does not change $this->state.
     */
    private function entityInAttributeValueState() {
        // Attempt to consume an entity; a falsy result means none was found.
        $text = $this->entity();
        if(!$text) {
            $text = '&';
        }

        $newest = count($this->token['attr']) - 1;
        $this->token['attr'][$newest]['value'] .= $text;
    }

    /**
     * Bogus comment state: turns everything up to the next ">" into a
     * comment token.
     */
    private function bogusCommentState() {
        /* Consume every character up to the first U+003E GREATER-THAN SIGN
        character (>) or the end of the file (EOF), whichever comes first.
Emit
        a comment token whose data is the concatenation of all the characters
        starting from and including the character that caused the state machine
        to switch into the bogus comment state, up to and including the last
        consumed character before the U+003E character, if any, or up to the
        end of the file otherwise. (If the comment was started by the end of
        the file (EOF), the token is empty.) */
        $data = $this->characters('^>', $this->char);
        $consumed = strlen($data);

        $this->emitToken(array(
            'data' => $data,
            'type' => self::COMMENT
        ));

        $this->char += $consumed;

        /* Switch to the data state. */
        $this->state = 'data';

        /* If the end of the file was reached, step back one position so the
        EOF character is reconsumed. */
        if($this->char === $this->EOF) {
            $this->char = $this->EOF - 1;
        }
    }

    /**
     * Markup declaration open state: handles the input following "<!".
     */
    private function markupDeclarationOpenState() {
        /* If the next two characters are both U+002D HYPHEN-MINUS (-)
        characters, consume those two characters, create a comment token whose
        data is null, and switch to the comment state. */
        if($this->character($this->char + 1, 2) === '--') {
            $this->char += 2;
            $this->state = 'comment';
            $this->token = array(
                'data' => null,
                'type' => self::COMMENT
            );

        /* Otherwise if the next seven characters are a case-insensitive match
        for the word "DOCTYPE", then consume those characters and switch to the
        DOCTYPE state. */
        } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
            $this->char += 7;
            $this->state = 'doctype';

        /* Otherwise, it is a parse error. Switch to the bogus comment state.
        The next character that is consumed, if any, is the first character
        that will be in the comment.
*/
        } else {
            $this->char++;
            $this->state = 'bogusComment';
        }
    }

    /**
     * Comment state: accumulates comment data until a "-" or EOF is seen.
     */
    private function commentState() {
        /* Consume the next input character: */
        $this->char++;
        $next = $this->char();

        /* U+002D HYPHEN-MINUS (-): switch to the comment dash state. */
        if($next === '-') {
            $this->state = 'commentDash';
            return;
        }

        /* EOF: parse error. Emit the comment token and reconsume the EOF
        character in the data state. */
        if($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            return;
        }

        /* Anything else: append the input character to the comment token's
        data. Stay in the comment state. */
        $this->token['data'] .= $next;
    }

    /**
     * Comment dash state: one "-" has been seen inside a comment.
     */
    private function commentDashState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        /* U+002D HYPHEN-MINUS (-) */
        if($char === '-') {
            /* Switch to the comment end state */
            $this->state = 'commentEnd';

        /* EOF */
        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the comment token. Reconsume the EOF character
            in the data state. */
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';

        /* Anything else */
        } else {
            /* Append a U+002D HYPHEN-MINUS (-) character and the input
            character to the comment token's data. Switch to the comment state.
*/
            $this->token['data'] .= '-'.$char;
            $this->state = 'comment';
        }
    }

    /**
     * Comment end state: "--" has been seen inside a comment.
     */
    private function commentEndState() {
        /* Consume the next input character: */
        $this->char++;
        $next = $this->char();

        // ">" closes the comment: emit it and return to the data state.
        if($next === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';
            return;
        }

        // A further "-": keep one dash in the data and stay in this state.
        if($next === '-') {
            $this->token['data'] .= '-';
            return;
        }

        // EOF: emit what we have and reconsume EOF in the data state.
        if($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            return;
        }

        // Anything else: the two dashes were data after all.
        $this->token['data'] .= '--'.$next;
        $this->state = 'comment';
    }

    /**
     * DOCTYPE state: consumes one whitespace character if present, then
     * moves to the before DOCTYPE name state either way.
     */
    private function doctypeState() {
        /* Consume the next input character: */
        $this->char++;
        $next = $this->char();

        // Not whitespace: step back so the character is reconsumed.
        if(!preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
            $this->char--;
        }

        $this->state = 'beforeDoctypeName';
    }

    /**
     * Before DOCTYPE name state: skips whitespace and starts the DOCTYPE
     * token on the first name character.
     */
    private function beforeDoctypeNameState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            // Stay in the before DOCTYPE name state.
960 961 } elseif(preg_match('/^[a-z]$/', $char)) { 962 $this->token = array( 963 'name' => strtoupper($char), 964 'type' => self::DOCTYPE, 965 'error' => true 966 ); 967 968 $this->state = 'doctypeName'; 969 970 } elseif($char === '>') { 971 $this->emitToken(array( 972 'name' => null, 973 'type' => self::DOCTYPE, 974 'error' => true 975 )); 976 977 $this->state = 'data'; 978 979 } elseif($this->char === $this->EOF) { 980 $this->emitToken(array( 981 'name' => null, 982 'type' => self::DOCTYPE, 983 'error' => true 984 )); 985 986 $this->char--; 987 $this->state = 'data'; 988 989 } else { 990 $this->token = array( 991 'name' => $char, 992 'type' => self::DOCTYPE, 993 'error' => true 994 ); 995 996 $this->state = 'doctypeName'; 997 } 998 } 999 1000 private function doctypeNameState() { 1001 /* Consume the next input character: */ 1002 $this->char++; 1003 $char = $this->char(); 1004 1005 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1006 $this->state = 'AfterDoctypeName'; 1007 1008 } elseif($char === '>') { 1009 $this->emitToken($this->token); 1010 $this->state = 'data'; 1011 1012 } elseif(preg_match('/^[a-z]$/', $char)) { 1013 $this->token['name'] .= strtoupper($char); 1014 1015 } elseif($this->char === $this->EOF) { 1016 $this->emitToken($this->token); 1017 $this->char--; 1018 $this->state = 'data'; 1019 1020 } else { 1021 $this->token['name'] .= $char; 1022 } 1023 1024 $this->token['error'] = ($this->token['name'] === 'HTML') 1025 ? false 1026 : true; 1027 } 1028 1029 private function afterDoctypeNameState() { 1030 /* Consume the next input character: */ 1031 $this->char++; 1032 $char = $this->char(); 1033 1034 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1035 // Stay in the DOCTYPE name state. 
} elseif($char === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';

        } else {
            $this->token['error'] = true;
            $this->state = 'bogusDoctype';
        }
    }

    /**
     * Bogus DOCTYPE state: discards input until ">" or end of input, then
     * emits the (already error-flagged) DOCTYPE token and returns to the data
     * state.
     */
    private function bogusDoctypeState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($char === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';

        } else {
            // Stay in the bogus DOCTYPE state.
        }
    }

    /**
     * Tries to consume an entity (character reference) at the current
     * position. Used when parsing entities in text and in attributes.
     *
     * @return string|false The result of html_entity_decode() for the matched
     *                      entity, or false when nothing matched, in which
     *                      case the input position is restored to where it
     *                      was on entry.
     */
    private function entity() {
        $start = $this->char;

        // The behaviour depends on the identity of the next character (the
        // one immediately after the U+0026 AMPERSAND character):

        switch($this->character($this->char + 1)) {
            // U+0023 NUMBER SIGN (#)
            case '#':

                // The behaviour further depends on the character after the
                // U+0023 NUMBER SIGN. That character sits two positions past
                // the ampersand; looking at +1 again would re-read the '#'
                // itself and make the hexadecimal branch unreachable.
                switch($this->character($this->char + 2)) {
                    // U+0078 LATIN SMALL LETTER X
                    // U+0058 LATIN CAPITAL LETTER X
                    case 'x':
                    case 'X':
                        // Hexadecimal reference: digits are 0-9, A-F, a-f and
                        // start one position further on (after the x).
                        $char = 1;
                        $char_class = '0-9A-Fa-f';
                    break;

                    // Anything else
                    default:
                        // Decimal reference: digits are just 0-9.
                        $char = 0;
                        $char_class = '0-9';
                    break;
                }

                // Consume as many characters as match the range of characters
                // given above.
                // NOTE(review): $entity is taken with
                // character($start, $this->char), i.e. the current offset is
                // passed as the second argument, and the position is not
                // advanced past the digits. Whether that yields the intended
                // "#123" / "#x7B" text depends on character()/characters(),
                // which are defined elsewhere -- confirm before relying on
                // numeric references being decoded here.
                $this->char++;
                $e_name = $this->characters($char_class, $this->char + $char + 1);
                $entity = $this->character($start, $this->char);
                $cond = strlen($e_name) > 0;

                // The rest of the parsing happens below.
            break;

            // Anything else
            default:
                // Consume the maximum number of characters possible, with the
                // consumed characters case-sensitively matching one of the
                // identifiers in the first column of the entities table.
                $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
                $len = strlen($e_name);

                // Try successively longer prefixes; the first one found in
                // the entity table wins.
                for($c = 1; $c <= $len; $c++) {
                    $id = substr($e_name, 0, $c);
                    $this->char++;

                    if(in_array($id, $this->entities, true)) {
                        if ($e_name[$c-1] !== ';') {
                            if ($c < $len && $e_name[$c] === ';') {
                                $this->char++; // consume extra semicolon
                            }
                        }
                        $entity = $id;
                        break;
                    }
                }

                $cond = isset($entity);
                // The rest of the parsing happens below.
            break;
        }

        if(!$cond) {
            // If no match can be made, then this is a parse error. No
            // characters are consumed, and nothing is returned.
            $this->char = $start;
            return false;
        }

        // Return a character token for the character corresponding to the
        // entity name (as given by the second column of the entities table).
        return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
    }

    /**
     * Hands a token to the tree constructor and updates the tokenizer's
     * content model from its answer.
     *
     * An integer returned by the tree constructor becomes the new content
     * model; otherwise an end tag resets the content model to PCDATA.
     *
     * @param array $token Token array; its 'type' key is read here.
     */
    private function emitToken($token) {
        $emit = $this->tree->emitToken($token);

        if(is_int($emit)) {
            $this->content_model = $emit;

        } elseif($token['type'] === self::ENDTAG) {
            $this->content_model = self::PCDATA;
        }
    }

    /**
     * Clears the tokenizer state and emits an end-of-file token to the tree
     * constructor.
     */
    private function EOF() {
        $this->state = null;
        $this->tree->emitToken(array(
            'type' => self::EOF
        ));
    }
}

class HTML5TreeConstructer {
    public $stack = array();

    private $phase;
    private $mode;
    private $dom;
    private $foster_parent = null;
    private $a_formatting = array();

    private $head_pointer = null;
    private $form_pointer = null;

    private $scoping = array('button','caption','html','marquee','object','table','td','th');
    private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
    private $special = array('address','area','base','basefont','bgsound',
    'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
    'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
    'h6','head','hr','iframe','image','img','input','isindex','li','link',
    'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
    'option','p','param','plaintext','pre','script','select','spacer','style',
    'tbody','textarea','tfoot','thead','title','tr','ul','wbr');

    // The different phases.
    const INIT_PHASE = 0;
    const ROOT_PHASE = 1;
    const MAIN_PHASE = 2;
    const END_PHASE  = 3;

    // The different insertion modes for the main phase.
const BEFOR_HEAD = 0;
    const IN_HEAD    = 1;
    const AFTER_HEAD = 2;
    const IN_BODY    = 3;
    const IN_TABLE   = 4;
    const IN_CAPTION = 5;
    const IN_CGROUP  = 6;
    const IN_TBODY   = 7;
    const IN_ROW     = 8;
    const IN_CELL    = 9;
    const IN_SELECT  = 10;
    const AFTER_BODY = 11;
    const IN_FRAME   = 12;
    const AFTR_FRAME = 13;

    // The different types of elements.
    const SPECIAL    = 0;
    const SCOPING    = 1;
    const FORMATTING = 2;
    const PHRASING   = 3;

    const MARKER     = 0;

    /**
     * Starts in the initial phase / "before head" insertion mode with an
     * empty UTF-8 DOMDocument.
     */
    public function __construct() {
        $this->phase = self::INIT_PHASE;
        $this->mode = self::BEFOR_HEAD;
        $this->dom = new DOMDocument;

        $this->dom->encoding = 'UTF-8';
        $this->dom->preserveWhiteSpace = true;
        $this->dom->substituteEntities = true;
        $this->dom->strictErrorChecking = false;
    }

    /**
     * Routes a token to the handler of the current phase.
     *
     * @param array $token Token produced by the tokenizer.
     * @return mixed Whatever the phase handler returns; null when the
     *               current phase matches none of the cases.
     */
    public function emitToken($token) {
        switch($this->phase) {
            case self::INIT_PHASE: return $this->initPhase($token);
            case self::ROOT_PHASE: return $this->rootElementPhase($token);
            case self::MAIN_PHASE: return $this->mainPhase($token);
            case self::END_PHASE : return $this->trailingEndPhase($token);
        }
    }

    /**
     * Initial phase: handles tokens seen before any root element exists.
     *
     * @param array $token Token array ('type', optionally 'error', 'data').
     * @return mixed Result of rootElementPhase() when the token is forwarded,
     *               otherwise null.
     */
    private function initPhase($token) {
        /* Initially, the tree construction stage must handle each token
        emitted from the tokenisation stage as follows: */

        /* A DOCTYPE token that is marked as being in error
        A comment token
        A start tag token
        An end tag token
        A character token that is not one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
            or U+0020 SPACE
        An end-of-file token */
        if((isset($token['error']) && $token['error']) ||
        $token['type'] === HTML5::COMMENT ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
            /* This specification does not define how to handle this case. In
            particular, user agents may ignore the entirety of this specification
            altogether for such documents, and instead invoke special parse modes
            with a greater emphasis on backwards compatibility. */

            $this->phase = self::ROOT_PHASE;
            return $this->rootElementPhase($token);

        /* A DOCTYPE token marked as being correct */
        } elseif(isset($token['error']) && !$token['error']) {
            /* No DocumentType node is attached to the document: the object
            that used to be constructed here was never appended or otherwise
            used, so it has been dropped. The only effect of a correct
            DOCTYPE is the phase switch below. */

            /* Switch to the root element phase of the tree construction
            stage. */
            $this->phase = self::ROOT_PHASE;

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
        $token['data'])) {
            /* Append that character to the Document node. */
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);
        }
    }

    /**
     * Root element phase: comments and whitespace are appended to the
     * document; any other character data, tag or end-of-file token creates
     * the <html> root, pushes it on the stack of open elements and is then
     * reprocessed in the main phase. DOCTYPE tokens are ignored.
     *
     * @param array $token Token array.
     * @return mixed Result of mainPhase() when the token is forwarded,
     *               otherwise null.
     */
    private function rootElementPhase($token) {
        /* After the initial phase, as each token is emitted from the tokenisation
        stage, it must be processed as described in this section. */

        /* A DOCTYPE token */
        if($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the Document object with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->dom->appendChild($comment);

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        } elseif($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append that character to the Document node. */
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);

        /* A character token that is not one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
            (FF), or U+0020 SPACE
        A start tag token
        An end tag token
        An end-of-file token */
        } elseif(($token['type'] === HTML5::CHARACTR &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF) {
            /* Create an HTMLElement node with the tag name html, in the HTML
            namespace. Append it to the Document object. Switch to the main
            phase and reprocess the current token. */
            $html = $this->dom->createElement('html');
            $this->dom->appendChild($html);
            $this->stack[] = $html;

            $this->phase = self::MAIN_PHASE;
            return $this->mainPhase($token);
        }
    }

    private function mainPhase($token) {
        /* Tokens in the main phase must be handled as follows: */

        /* A DOCTYPE token */
        if($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.
/* A start tag token with the tag name "html" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
            /* If this start tag token was not the first start tag token, then
            it is a parse error. */

            /* For each attribute on the token, check to see if the attribute
            is already present on the top element of the stack of open elements.
            If it is not, add the attribute and its corresponding value to that
            element. */
            foreach($token['attr'] as $attr) {
                if(!$this->stack[0]->hasAttribute($attr['name'])) {
                    $this->stack[0]->setAttribute($attr['name'], $attr['value']);
                }
            }

        /* An end-of-file token */
        } elseif($token['type'] === HTML5::EOF) {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

        /* Anything else. */
        } else {
            /* Depends on the insertion mode: */
            switch($this->mode) {
                case self::BEFOR_HEAD: return $this->beforeHead($token); break;
                case self::IN_HEAD:    return $this->inHead($token); break;
                case self::AFTER_HEAD: return $this->afterHead($token); break;
                case self::IN_BODY:    return $this->inBody($token); break;
                case self::IN_TABLE:   return $this->inTable($token); break;
                case self::IN_CAPTION: return $this->inCaption($token); break;
                case self::IN_CGROUP:  return $this->inColumnGroup($token); break;
                case self::IN_TBODY:   return $this->inTableBody($token); break;
                case self::IN_ROW:     return $this->inRow($token); break;
                case self::IN_CELL:    return $this->inCell($token); break;
                case self::IN_SELECT:  return $this->inSelect($token); break;
                case self::AFTER_BODY: return $this->afterBody($token); break;
                case self::IN_FRAME:   return $this->inFrameset($token); break;
                case self::AFTR_FRAME: return $this->afterFrameset($token); break;
                // END_PHASE is a phase constant (3) with the same value as
                // IN_BODY (3); the IN_BODY case above always wins, so this
                // case is never selected.
                case self::END_PHASE:  return $this->trailingEndPhase($token); break;
            }
        }
    }

    /**
     * "Before head" insertion mode.
     *
     * Whitespace and comments are inserted as-is. A <head> start tag creates
     * the head element, records it as the head pointer and switches to
     * "in head". Any other start tag, an </html> end tag or non-whitespace
     * text first synthesises a <head> start tag and is then reprocessed by
     * inHead(). Other end tags are ignored.
     *
     * @param array $token Token array.
     * @return mixed Result of inHead() when the token is reprocessed,
     *               otherwise null.
     */
    private function beforeHead($token) {
        /* Handle the token as follows: */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token with the tag name "head" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
            /* Create an element for the token, append the new element to the
            current node and push it onto the stack of open elements. */
            $element = $this->insertElement($token);

            /* Set the head element pointer to this new element node. */
            $this->head_pointer = $element;

            /* Change the insertion mode to "in head". */
            $this->mode = self::IN_HEAD;

        /* A start tag token whose tag name is one of: "base", "link", "meta",
        "script", "style", "title". Or an end tag with the tag name "html".
        Or a character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE. Or any other start tag token.

        The whitespace pattern now carries the same "+" quantifier as the
        check at the top of this method. Whitespace-only data never reaches
        this branch, so the outcome is unchanged. */
        } elseif($token['type'] === HTML5::STARTTAG ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]+$/',
        $token['data']))) {
            /* Act as if a start tag token with the tag name "head" and no
            attributes had been seen, then reprocess the current token. */
            $this->beforeHead(array(
                'name' => 'head',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inHead($token);

        /* Any other end tag */
        } elseif($token['type'] === HTML5::ENDTAG) {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * "In head" insertion mode.
     *
     * Every branch that touches the head element pointer checks it for null
     * first (the innerHTML case mentioned in the comments below), so the
     * handler does not call a method on null when no head element was
     * recorded.
     *
     * @param array $token Token array.
     * @return mixed A content-model constant (HTML5::PCDATA, HTML5::RCDATA or
     *               HTML5::CDATA) for title/style/script tags, the result of
     *               afterHead() when the token is reprocessed there, or null.
     */
    private function inHead($token) {
        /* Handle the token as follows: */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE.

        THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
        or script element, append the character to the current node regardless
        of its content. */
        if(($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
        $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
        array('title', 'style', 'script'), true))) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* An end tag closing title, style or script: pop it and switch the
        tokeniser back to PCDATA. */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'), true)) {
            array_pop($this->stack);
            return HTML5::PCDATA;

        /* A start tag with the tag name "title" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the RCDATA state. */
            return HTML5::RCDATA;

        /* A start tag with the tag name "style" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the CDATA state. */
            return HTML5::CDATA;

        /* A start tag with the tag name "script" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
            /* Create an element for the token. Same placement rule as for
            title and style: under the head element when there is one,
            otherwise under the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the CDATA state. */
            return HTML5::CDATA;

        /* A start tag with the tag name "base", "link", or "meta" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta'), true)) {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);
                array_pop($this->stack);

            } else {
                $this->insertElement($token);
            }

        /* An end tag with the tag name "head" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
            /* If the current node is a head element, pop the current node off
            the stack of open elements. Otherwise, this is a parse error and
            nothing is popped. */
            if($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))) {
                array_pop($this->stack);
            }

            /* Change the insertion mode to "after head". */
            $this->mode = self::AFTER_HEAD;

        /* A start tag with the tag name "head" or an end tag except "html". */
        } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
            // Parse error. Ignore the token.

        /* Anything else */
        } else {
            /* If the current node is a head element, act as if an end tag
            token with the tag name "head" had been seen. */
            if($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))) {
                $this->inHead(array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                ));

            /* Otherwise, change the insertion mode to "after head". */
            } else {
                $this->mode = self::AFTER_HEAD;
            }

            /* Then, reprocess the current token. */
            return $this->afterHead($token);
        }
    }

    /**
     * "After head" insertion mode.
     *
     * Whitespace and comments are inserted; <body> and <frameset> start tags
     * create their element and switch mode; head-only start tags are sent
     * back to inHead(); anything else synthesises a <body> start tag and is
     * reprocessed by inBody().
     *
     * @param array $token Token array.
     * @return mixed Result of inHead()/inBody() when the token is
     *               reprocessed, otherwise null.
     */
    private function afterHead($token) {
        /* Handle the token as follows: */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token with the tag name "body" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
            /* Insert a body element for the token. */
            $this->insertElement($token);

            /* Change the insertion mode to "in body". */
            $this->mode = self::IN_BODY;

        /* A start tag token with the tag name "frameset" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
            /* Insert a frameset element for the token. */
            $this->insertElement($token);

            /* Change the insertion mode to "in frameset". */
            $this->mode = self::IN_FRAME;

        /* A start tag token whose tag name is one of: "base", "link", "meta",
        "script", "style", "title" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta', 'script', 'style', 'title'), true)) {
            /* Parse error. Switch the insertion mode back to "in head" and
            reprocess the token. */
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);

        /* Anything else */
        } else {
            /* Act as if a start tag token with the tag name "body" and no
            attributes had been seen, and then reprocess the current token. */
            $this->afterHead(array(
                'name' => 'body',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inBody($token);
        }
    }

    private function inBody($token) {
        /* Handle the token as follows: */

        switch($token['type']) {
            /* A character token */
            case HTML5::CHARACTR:
                /* Reconstruct the active formatting elements, if any.
*/ 1641 $this->reconstructActiveFormattingElements(); 1642 1643 /* Append the token's character to the current node. */ 1644 $this->insertText($token['data']); 1645 break; 1646 1647 /* A comment token */ 1648 case HTML5::COMMENT: 1649 /* Append a Comment node to the current node with the data 1650 attribute set to the data given in the comment token. */ 1651 $this->insertComment($token['data']); 1652 break; 1653 1654 case HTML5::STARTTAG: 1655 switch($token['name']) { 1656 /* A start tag token whose tag name is one of: "script", 1657 "style" */ 1658 case 'script': case 'style': 1659 /* Process the token as if the insertion mode had been "in 1660 head". */ 1661 return $this->inHead($token); 1662 break; 1663 1664 /* A start tag token whose tag name is one of: "base", "link", 1665 "meta", "title" */ 1666 case 'base': case 'link': case 'meta': case 'title': 1667 /* Parse error. Process the token as if the insertion mode 1668 had been "in head". */ 1669 return $this->inHead($token); 1670 break; 1671 1672 /* A start tag token with the tag name "body" */ 1673 case 'body': 1674 /* Parse error. If the second element on the stack of open 1675 elements is not a body element, or, if the stack of open 1676 elements has only one node on it, then ignore the token. 1677 (innerHTML case) */ 1678 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { 1679 // Ignore 1680 1681 /* Otherwise, for each attribute on the token, check to see 1682 if the attribute is already present on the body element (the 1683 second element) on the stack of open elements. If it is not, 1684 add the attribute and its corresponding value to that 1685 element. 
*/ 1686 } else { 1687 foreach($token['attr'] as $attr) { 1688 if(!$this->stack[1]->hasAttribute($attr['name'])) { 1689 $this->stack[1]->setAttribute($attr['name'], $attr['value']); 1690 } 1691 } 1692 } 1693 break; 1694 1695 /* A start tag whose tag name is one of: "address", 1696 "blockquote", "center", "dir", "div", "dl", "fieldset", 1697 "listing", "menu", "ol", "p", "ul" */ 1698 case 'address': case 'blockquote': case 'center': case 'dir': 1699 case 'div': case 'dl': case 'fieldset': case 'listing': 1700 case 'menu': case 'ol': case 'p': case 'ul': 1701 /* If the stack of open elements has a p element in scope, 1702 then act as if an end tag with the tag name p had been 1703 seen. */ 1704 if($this->elementInScope('p')) { 1705 $this->emitToken(array( 1706 'name' => 'p', 1707 'type' => HTML5::ENDTAG 1708 )); 1709 } 1710 1711 /* Insert an HTML element for the token. */ 1712 $this->insertElement($token); 1713 break; 1714 1715 /* A start tag whose tag name is "form" */ 1716 case 'form': 1717 /* If the form element pointer is not null, ignore the 1718 token with a parse error. */ 1719 if($this->form_pointer !== null) { 1720 // Ignore. 1721 1722 /* Otherwise: */ 1723 } else { 1724 /* If the stack of open elements has a p element in 1725 scope, then act as if an end tag with the tag name p 1726 had been seen. */ 1727 if($this->elementInScope('p')) { 1728 $this->emitToken(array( 1729 'name' => 'p', 1730 'type' => HTML5::ENDTAG 1731 )); 1732 } 1733 1734 /* Insert an HTML element for the token, and set the 1735 form element pointer to point to the element created. */ 1736 $element = $this->insertElement($token); 1737 $this->form_pointer = $element; 1738 } 1739 break; 1740 1741 /* A start tag whose tag name is "li", "dd" or "dt" */ 1742 case 'li': case 'dd': case 'dt': 1743 /* If the stack of open elements has a p element in scope, 1744 then act as if an end tag with the tag name p had been 1745 seen. 
*/ 1746 if($this->elementInScope('p')) { 1747 $this->emitToken(array( 1748 'name' => 'p', 1749 'type' => HTML5::ENDTAG 1750 )); 1751 } 1752 1753 $stack_length = count($this->stack) - 1; 1754 1755 for($n = $stack_length; 0 <= $n; $n--) { 1756 /* 1. Initialise node to be the current node (the 1757 bottommost node of the stack). */ 1758 $stop = false; 1759 $node = $this->stack[$n]; 1760 $cat = $this->getElementCategory($node->tagName); 1761 1762 /* 2. If node is an li, dd or dt element, then pop all 1763 the nodes from the current node up to node, including 1764 node, then stop this algorithm. */ 1765 if($token['name'] === $node->tagName || ($token['name'] !== 'li' 1766 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) { 1767 for($x = $stack_length; $x >= $n ; $x--) { 1768 array_pop($this->stack); 1769 } 1770 1771 break; 1772 } 1773 1774 /* 3. If node is not in the formatting category, and is 1775 not in the phrasing category, and is not an address or 1776 div element, then stop this algorithm. */ 1777 if($cat !== self::FORMATTING && $cat !== self::PHRASING && 1778 $node->tagName !== 'address' && $node->tagName !== 'div') { 1779 break; 1780 } 1781 } 1782 1783 /* Finally, insert an HTML element with the same tag 1784 name as the token's. */ 1785 $this->insertElement($token); 1786 break; 1787 1788 /* A start tag token whose tag name is "plaintext" */ 1789 case 'plaintext': 1790 /* If the stack of open elements has a p element in scope, 1791 then act as if an end tag with the tag name p had been 1792 seen. */ 1793 if($this->elementInScope('p')) { 1794 $this->emitToken(array( 1795 'name' => 'p', 1796 'type' => HTML5::ENDTAG 1797 )); 1798 } 1799 1800 /* Insert an HTML element for the token. 
*/ 1801 $this->insertElement($token); 1802 1803 return HTML5::PLAINTEXT; 1804 break; 1805 1806 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", 1807 "h5", "h6" */ 1808 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': 1809 /* If the stack of open elements has a p element in scope, 1810 then act as if an end tag with the tag name p had been seen. */ 1811 if($this->elementInScope('p')) { 1812 $this->emitToken(array( 1813 'name' => 'p', 1814 'type' => HTML5::ENDTAG 1815 )); 1816 } 1817 1818 /* If the stack of open elements has in scope an element whose 1819 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 1820 this is a parse error; pop elements from the stack until an 1821 element with one of those tag names has been popped from the 1822 stack. */ 1823 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { 1824 array_pop($this->stack); 1825 } 1826 1827 /* Insert an HTML element for the token. */ 1828 $this->insertElement($token); 1829 break; 1830 1831 /* A start tag whose tag name is "a" */ 1832 case 'a': 1833 /* If the list of active formatting elements contains 1834 an element whose tag name is "a" between the end of the 1835 list and the last marker on the list (or the start of 1836 the list if there is no marker on the list), then this 1837 is a parse error; act as if an end tag with the tag name 1838 "a" had been seen, then remove that element from the list 1839 of active formatting elements and the stack of open 1840 elements if the end tag didn't already remove it (it 1841 might not have if the element is not in table scope). 
*/ 1842 $leng = count($this->a_formatting); 1843 1844 for($n = $leng - 1; $n >= 0; $n--) { 1845 if($this->a_formatting[$n] === self::MARKER) { 1846 break; 1847 1848 } elseif($this->a_formatting[$n]->nodeName === 'a') { 1849 $this->emitToken(array( 1850 'name' => 'a', 1851 'type' => HTML5::ENDTAG 1852 )); 1853 break; 1854 } 1855 } 1856 1857 /* Reconstruct the active formatting elements, if any. */ 1858 $this->reconstructActiveFormattingElements(); 1859 1860 /* Insert an HTML element for the token. */ 1861 $el = $this->insertElement($token); 1862 1863 /* Add that element to the list of active formatting 1864 elements. */ 1865 $this->a_formatting[] = $el; 1866 break; 1867 1868 /* A start tag whose tag name is one of: "b", "big", "em", "font", 1869 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 1870 case 'b': case 'big': case 'em': case 'font': case 'i': 1871 case 'nobr': case 's': case 'small': case 'strike': 1872 case 'strong': case 'tt': case 'u': 1873 /* Reconstruct the active formatting elements, if any. */ 1874 $this->reconstructActiveFormattingElements(); 1875 1876 /* Insert an HTML element for the token. */ 1877 $el = $this->insertElement($token); 1878 1879 /* Add that element to the list of active formatting 1880 elements. */ 1881 $this->a_formatting[] = $el; 1882 break; 1883 1884 /* A start tag token whose tag name is "button" */ 1885 case 'button': 1886 /* If the stack of open elements has a button element in scope, 1887 then this is a parse error; act as if an end tag with the tag 1888 name "button" had been seen, then reprocess the token. (We don't 1889 do that. Unnecessary.) */ 1890 if($this->elementInScope('button')) { 1891 $this->inBody(array( 1892 'name' => 'button', 1893 'type' => HTML5::ENDTAG 1894 )); 1895 } 1896 1897 /* Reconstruct the active formatting elements, if any. */ 1898 $this->reconstructActiveFormattingElements(); 1899 1900 /* Insert an HTML element for the token. 
*/ 1901 $this->insertElement($token); 1902 1903 /* Insert a marker at the end of the list of active 1904 formatting elements. */ 1905 $this->a_formatting[] = self::MARKER; 1906 break; 1907 1908 /* A start tag token whose tag name is one of: "marquee", "object" */ 1909 case 'marquee': case 'object': 1910 /* Reconstruct the active formatting elements, if any. */ 1911 $this->reconstructActiveFormattingElements(); 1912 1913 /* Insert an HTML element for the token. */ 1914 $this->insertElement($token); 1915 1916 /* Insert a marker at the end of the list of active 1917 formatting elements. */ 1918 $this->a_formatting[] = self::MARKER; 1919 break; 1920 1921 /* A start tag token whose tag name is "xmp" */ 1922 case 'xmp': 1923 /* Reconstruct the active formatting elements, if any. */ 1924 $this->reconstructActiveFormattingElements(); 1925 1926 /* Insert an HTML element for the token. */ 1927 $this->insertElement($token); 1928 1929 /* Switch the content model flag to the CDATA state. */ 1930 return HTML5::CDATA; 1931 break; 1932 1933 /* A start tag whose tag name is "table" */ 1934 case 'table': 1935 /* If the stack of open elements has a p element in scope, 1936 then act as if an end tag with the tag name p had been seen. */ 1937 if($this->elementInScope('p')) { 1938 $this->emitToken(array( 1939 'name' => 'p', 1940 'type' => HTML5::ENDTAG 1941 )); 1942 } 1943 1944 /* Insert an HTML element for the token. */ 1945 $this->insertElement($token); 1946 1947 /* Change the insertion mode to "in table". */ 1948 $this->mode = self::IN_TABLE; 1949 break; 1950 1951 /* A start tag whose tag name is one of: "area", "basefont", 1952 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ 1953 case 'area': case 'basefont': case 'bgsound': case 'br': 1954 case 'embed': case 'img': case 'param': case 'spacer': 1955 case 'wbr': 1956 /* Reconstruct the active formatting elements, if any. 
*/ 1957 $this->reconstructActiveFormattingElements(); 1958 1959 /* Insert an HTML element for the token. */ 1960 $this->insertElement($token); 1961 1962 /* Immediately pop the current node off the stack of open elements. */ 1963 array_pop($this->stack); 1964 break; 1965 1966 /* A start tag whose tag name is "hr" */ 1967 case 'hr': 1968 /* If the stack of open elements has a p element in scope, 1969 then act as if an end tag with the tag name p had been seen. */ 1970 if($this->elementInScope('p')) { 1971 $this->emitToken(array( 1972 'name' => 'p', 1973 'type' => HTML5::ENDTAG 1974 )); 1975 } 1976 1977 /* Insert an HTML element for the token. */ 1978 $this->insertElement($token); 1979 1980 /* Immediately pop the current node off the stack of open elements. */ 1981 array_pop($this->stack); 1982 break; 1983 1984 /* A start tag whose tag name is "image" */ 1985 case 'image': 1986 /* Parse error. Change the token's tag name to "img" and 1987 reprocess it. (Don't ask.) */ 1988 $token['name'] = 'img'; 1989 return $this->inBody($token); 1990 break; 1991 1992 /* A start tag whose tag name is "input" */ 1993 case 'input': 1994 /* Reconstruct the active formatting elements, if any. */ 1995 $this->reconstructActiveFormattingElements(); 1996 1997 /* Insert an input element for the token. */ 1998 $element = $this->insertElement($token, false); 1999 2000 /* If the form element pointer is not null, then associate the 2001 input element with the form element pointed to by the form 2002 element pointer. */ 2003 $this->form_pointer !== null 2004 ? $this->form_pointer->appendChild($element) 2005 : end($this->stack)->appendChild($element); 2006 2007 /* Pop that input element off the stack of open elements. */ 2008 array_pop($this->stack); 2009 break; 2010 2011 /* A start tag whose tag name is "isindex" */ 2012 case 'isindex': 2013 /* Parse error. */ 2014 // w/e 2015 2016 /* If the form element pointer is not null, 2017 then ignore the token. 
*/ 2018 if($this->form_pointer === null) { 2019 /* Act as if a start tag token with the tag name "form" had 2020 been seen. */ 2021 $this->inBody(array( 2022 'name' => 'body', 2023 'type' => HTML5::STARTTAG, 2024 'attr' => array() 2025 )); 2026 2027 /* Act as if a start tag token with the tag name "hr" had 2028 been seen. */ 2029 $this->inBody(array( 2030 'name' => 'hr', 2031 'type' => HTML5::STARTTAG, 2032 'attr' => array() 2033 )); 2034 2035 /* Act as if a start tag token with the tag name "p" had 2036 been seen. */ 2037 $this->inBody(array( 2038 'name' => 'p', 2039 'type' => HTML5::STARTTAG, 2040 'attr' => array() 2041 )); 2042 2043 /* Act as if a start tag token with the tag name "label" 2044 had been seen. */ 2045 $this->inBody(array( 2046 'name' => 'label', 2047 'type' => HTML5::STARTTAG, 2048 'attr' => array() 2049 )); 2050 2051 /* Act as if a stream of character tokens had been seen. */ 2052 $this->insertText('This is a searchable index. '. 2053 'Insert your search keywords here: '); 2054 2055 /* Act as if a start tag token with the tag name "input" 2056 had been seen, with all the attributes from the "isindex" 2057 token, except with the "name" attribute set to the value 2058 "isindex" (ignoring any explicit "name" attribute). */ 2059 $attr = $token['attr']; 2060 $attr[] = array('name' => 'name', 'value' => 'isindex'); 2061 2062 $this->inBody(array( 2063 'name' => 'input', 2064 'type' => HTML5::STARTTAG, 2065 'attr' => $attr 2066 )); 2067 2068 /* Act as if a stream of character tokens had been seen 2069 (see below for what they should say). */ 2070 $this->insertText('This is a searchable index. '. 2071 'Insert your search keywords here: '); 2072 2073 /* Act as if an end tag token with the tag name "label" 2074 had been seen. */ 2075 $this->inBody(array( 2076 'name' => 'label', 2077 'type' => HTML5::ENDTAG 2078 )); 2079 2080 /* Act as if an end tag token with the tag name "p" had 2081 been seen. 
*/ 2082 $this->inBody(array( 2083 'name' => 'p', 2084 'type' => HTML5::ENDTAG 2085 )); 2086 2087 /* Act as if a start tag token with the tag name "hr" had 2088 been seen. */ 2089 $this->inBody(array( 2090 'name' => 'hr', 2091 'type' => HTML5::ENDTAG 2092 )); 2093 2094 /* Act as if an end tag token with the tag name "form" had 2095 been seen. */ 2096 $this->inBody(array( 2097 'name' => 'form', 2098 'type' => HTML5::ENDTAG 2099 )); 2100 } 2101 break; 2102 2103 /* A start tag whose tag name is "textarea" */ 2104 case 'textarea': 2105 $this->insertElement($token); 2106 2107 /* Switch the tokeniser's content model flag to the 2108 RCDATA state. */ 2109 return HTML5::RCDATA; 2110 break; 2111 2112 /* A start tag whose tag name is one of: "iframe", "noembed", 2113 "noframes" */ 2114 case 'iframe': case 'noembed': case 'noframes': 2115 $this->insertElement($token); 2116 2117 /* Switch the tokeniser's content model flag to the CDATA state. */ 2118 return HTML5::CDATA; 2119 break; 2120 2121 /* A start tag whose tag name is "select" */ 2122 case 'select': 2123 /* Reconstruct the active formatting elements, if any. */ 2124 $this->reconstructActiveFormattingElements(); 2125 2126 /* Insert an HTML element for the token. */ 2127 $this->insertElement($token); 2128 2129 /* Change the insertion mode to "in select". */ 2130 $this->mode = self::IN_SELECT; 2131 break; 2132 2133 /* A start or end tag whose tag name is one of: "caption", "col", 2134 "colgroup", "frame", "frameset", "head", "option", "optgroup", 2135 "tbody", "td", "tfoot", "th", "thead", "tr". */ 2136 case 'caption': case 'col': case 'colgroup': case 'frame': 2137 case 'frameset': case 'head': case 'option': case 'optgroup': 2138 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': 2139 case 'tr': 2140 // Parse error. Ignore the token. 
2141 break; 2142 2143 /* A start or end tag whose tag name is one of: "event-source", 2144 "section", "nav", "article", "aside", "header", "footer", 2145 "datagrid", "command" */ 2146 case 'event-source': case 'section': case 'nav': case 'article': 2147 case 'aside': case 'header': case 'footer': case 'datagrid': 2148 case 'command': 2149 // Work in progress! 2150 break; 2151 2152 /* A start tag token not covered by the previous entries */ 2153 default: 2154 /* Reconstruct the active formatting elements, if any. */ 2155 $this->reconstructActiveFormattingElements(); 2156 2157 $this->insertElement($token, true, true); 2158 break; 2159 } 2160 break; 2161 2162 case HTML5::ENDTAG: 2163 switch($token['name']) { 2164 /* An end tag with the tag name "body" */ 2165 case 'body': 2166 /* If the second element in the stack of open elements is 2167 not a body element, this is a parse error. Ignore the token. 2168 (innerHTML case) */ 2169 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { 2170 // Ignore. 2171 2172 /* If the current node is not the body element, then this 2173 is a parse error. */ 2174 } elseif(end($this->stack)->nodeName !== 'body') { 2175 // Parse error. 2176 } 2177 2178 /* Change the insertion mode to "after body". */ 2179 $this->mode = self::AFTER_BODY; 2180 break; 2181 2182 /* An end tag with the tag name "html" */ 2183 case 'html': 2184 /* Act as if an end tag with tag name "body" had been seen, 2185 then, if that token wasn't ignored, reprocess the current 2186 token. 
*/ 2187 $this->inBody(array( 2188 'name' => 'body', 2189 'type' => HTML5::ENDTAG 2190 )); 2191 2192 return $this->afterBody($token); 2193 break; 2194 2195 /* An end tag whose tag name is one of: "address", "blockquote", 2196 "center", "dir", "div", "dl", "fieldset", "listing", "menu", 2197 "ol", "pre", "ul" */ 2198 case 'address': case 'blockquote': case 'center': case 'dir': 2199 case 'div': case 'dl': case 'fieldset': case 'listing': 2200 case 'menu': case 'ol': case 'pre': case 'ul': 2201 /* If the stack of open elements has an element in scope 2202 with the same tag name as that of the token, then generate 2203 implied end tags. */ 2204 if($this->elementInScope($token['name'])) { 2205 $this->generateImpliedEndTags(); 2206 2207 /* Now, if the current node is not an element with 2208 the same tag name as that of the token, then this 2209 is a parse error. */ 2210 // w/e 2211 2212 /* If the stack of open elements has an element in 2213 scope with the same tag name as that of the token, 2214 then pop elements from this stack until an element 2215 with that tag name has been popped from the stack. */ 2216 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2217 if($this->stack[$n]->nodeName === $token['name']) { 2218 $n = -1; 2219 } 2220 2221 array_pop($this->stack); 2222 } 2223 } 2224 break; 2225 2226 /* An end tag whose tag name is "form" */ 2227 case 'form': 2228 /* If the stack of open elements has an element in scope 2229 with the same tag name as that of the token, then generate 2230 implied end tags. */ 2231 if($this->elementInScope($token['name'])) { 2232 $this->generateImpliedEndTags(); 2233 2234 } 2235 2236 if(end($this->stack)->nodeName !== $token['name']) { 2237 /* Now, if the current node is not an element with the 2238 same tag name as that of the token, then this is a parse 2239 error. 
*/ 2240 // w/e 2241 2242 } else { 2243 /* Otherwise, if the current node is an element with 2244 the same tag name as that of the token pop that element 2245 from the stack. */ 2246 array_pop($this->stack); 2247 } 2248 2249 /* In any case, set the form element pointer to null. */ 2250 $this->form_pointer = null; 2251 break; 2252 2253 /* An end tag whose tag name is "p" */ 2254 case 'p': 2255 /* If the stack of open elements has a p element in scope, 2256 then generate implied end tags, except for p elements. */ 2257 if($this->elementInScope('p')) { 2258 $this->generateImpliedEndTags(array('p')); 2259 2260 /* If the current node is not a p element, then this is 2261 a parse error. */ 2262 // k 2263 2264 /* If the stack of open elements has a p element in 2265 scope, then pop elements from this stack until the stack 2266 no longer has a p element in scope. */ 2267 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2268 if($this->elementInScope('p')) { 2269 array_pop($this->stack); 2270 2271 } else { 2272 break; 2273 } 2274 } 2275 } 2276 break; 2277 2278 /* An end tag whose tag name is "dd", "dt", or "li" */ 2279 case 'dd': case 'dt': case 'li': 2280 /* If the stack of open elements has an element in scope 2281 whose tag name matches the tag name of the token, then 2282 generate implied end tags, except for elements with the 2283 same tag name as the token. */ 2284 if($this->elementInScope($token['name'])) { 2285 $this->generateImpliedEndTags(array($token['name'])); 2286 2287 /* If the current node is not an element with the same 2288 tag name as the token, then this is a parse error. */ 2289 // w/e 2290 2291 /* If the stack of open elements has an element in scope 2292 whose tag name matches the tag name of the token, then 2293 pop elements from this stack until an element with that 2294 tag name has been popped from the stack. 
*/ 2295 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2296 if($this->stack[$n]->nodeName === $token['name']) { 2297 $n = -1; 2298 } 2299 2300 array_pop($this->stack); 2301 } 2302 } 2303 break; 2304 2305 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", 2306 "h5", "h6" */ 2307 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': 2308 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); 2309 2310 /* If the stack of open elements has in scope an element whose 2311 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 2312 generate implied end tags. */ 2313 if($this->elementInScope($elements)) { 2314 $this->generateImpliedEndTags(); 2315 2316 /* Now, if the current node is not an element with the same 2317 tag name as that of the token, then this is a parse error. */ 2318 // w/e 2319 2320 /* If the stack of open elements has in scope an element 2321 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or 2322 "h6", then pop elements from the stack until an element 2323 with one of those tag names has been popped from the stack. */ 2324 while($this->elementInScope($elements)) { 2325 array_pop($this->stack); 2326 } 2327 } 2328 break; 2329 2330 /* An end tag whose tag name is one of: "a", "b", "big", "em", 2331 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 2332 case 'a': case 'b': case 'big': case 'em': case 'font': 2333 case 'i': case 'nobr': case 's': case 'small': case 'strike': 2334 case 'strong': case 'tt': case 'u': 2335 /* 1. Let the formatting element be the last element in 2336 the list of active formatting elements that: 2337 * is between the end of the list and the last scope 2338 marker in the list, if any, or the start of the list 2339 otherwise, and 2340 * has the same tag name as the token. 
2341 */ 2342 while(true) { 2343 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) { 2344 if($this->a_formatting[$a] === self::MARKER) { 2345 break; 2346 2347 } elseif($this->a_formatting[$a]->tagName === $token['name']) { 2348 $formatting_element = $this->a_formatting[$a]; 2349 $in_stack = in_array($formatting_element, $this->stack, true); 2350 $fe_af_pos = $a; 2351 break; 2352 } 2353 } 2354 2355 /* If there is no such node, or, if that node is 2356 also in the stack of open elements but the element 2357 is not in scope, then this is a parse error. Abort 2358 these steps. The token is ignored. */ 2359 if(!isset($formatting_element) || ($in_stack && 2360 !$this->elementInScope($token['name']))) { 2361 break; 2362 2363 /* Otherwise, if there is such a node, but that node 2364 is not in the stack of open elements, then this is a 2365 parse error; remove the element from the list, and 2366 abort these steps. */ 2367 } elseif(isset($formatting_element) && !$in_stack) { 2368 unset($this->a_formatting[$fe_af_pos]); 2369 $this->a_formatting = array_merge($this->a_formatting); 2370 break; 2371 } 2372 2373 /* 2. Let the furthest block be the topmost node in the 2374 stack of open elements that is lower in the stack 2375 than the formatting element, and is not an element in 2376 the phrasing or formatting categories. There might 2377 not be one. */ 2378 $fe_s_pos = array_search($formatting_element, $this->stack, true); 2379 $length = count($this->stack); 2380 2381 for($s = $fe_s_pos + 1; $s < $length; $s++) { 2382 $category = $this->getElementCategory($this->stack[$s]->nodeName); 2383 2384 if($category !== self::PHRASING && $category !== self::FORMATTING) { 2385 $furthest_block = $this->stack[$s]; 2386 } 2387 } 2388 2389 /* 3. 
If there is no furthest block, then the UA must 2390 skip the subsequent steps and instead just pop all 2391 the nodes from the bottom of the stack of open 2392 elements, from the current node up to the formatting 2393 element, and remove the formatting element from the 2394 list of active formatting elements. */ 2395 if(!isset($furthest_block)) { 2396 for($n = $length - 1; $n >= $fe_s_pos; $n--) { 2397 array_pop($this->stack); 2398 } 2399 2400 unset($this->a_formatting[$fe_af_pos]); 2401 $this->a_formatting = array_merge($this->a_formatting); 2402 break; 2403 } 2404 2405 /* 4. Let the common ancestor be the element 2406 immediately above the formatting element in the stack 2407 of open elements. */ 2408 $common_ancestor = $this->stack[$fe_s_pos - 1]; 2409 2410 /* 5. If the furthest block has a parent node, then 2411 remove the furthest block from its parent node. */ 2412 if($furthest_block->parentNode !== null) { 2413 $furthest_block->parentNode->removeChild($furthest_block); 2414 } 2415 2416 /* 6. Let a bookmark note the position of the 2417 formatting element in the list of active formatting 2418 elements relative to the elements on either side 2419 of it in the list. */ 2420 $bookmark = $fe_af_pos; 2421 2422 /* 7. Let node and last node be the furthest block. 2423 Follow these steps: */ 2424 $node = $furthest_block; 2425 $last_node = $furthest_block; 2426 2427 while(true) { 2428 for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { 2429 /* 7.1 Let node be the element immediately 2430 prior to node in the stack of open elements. */ 2431 $node = $this->stack[$n]; 2432 2433 /* 7.2 If node is not in the list of active 2434 formatting elements, then remove node from 2435 the stack of open elements and then go back 2436 to step 1. 
*/ 2437 if(!in_array($node, $this->a_formatting, true)) { 2438 unset($this->stack[$n]); 2439 $this->stack = array_merge($this->stack); 2440 2441 } else { 2442 break; 2443 } 2444 } 2445 2446 /* 7.3 Otherwise, if node is the formatting 2447 element, then go to the next step in the overall 2448 algorithm. */ 2449 if($node === $formatting_element) { 2450 break; 2451 2452 /* 7.4 Otherwise, if last node is the furthest 2453 block, then move the aforementioned bookmark to 2454 be immediately after the node in the list of 2455 active formatting elements. */ 2456 } elseif($last_node === $furthest_block) { 2457 $bookmark = array_search($node, $this->a_formatting, true) + 1; 2458 } 2459 2460 /* 7.5 If node has any children, perform a 2461 shallow clone of node, replace the entry for 2462 node in the list of active formatting elements 2463 with an entry for the clone, replace the entry 2464 for node in the stack of open elements with an 2465 entry for the clone, and let node be the clone. */ 2466 if($node->hasChildNodes()) { 2467 $clone = $node->cloneNode(); 2468 $s_pos = array_search($node, $this->stack, true); 2469 $a_pos = array_search($node, $this->a_formatting, true); 2470 2471 $this->stack[$s_pos] = $clone; 2472 $this->a_formatting[$a_pos] = $clone; 2473 $node = $clone; 2474 } 2475 2476 /* 7.6 Insert last node into node, first removing 2477 it from its previous parent node if any. */ 2478 if($last_node->parentNode !== null) { 2479 $last_node->parentNode->removeChild($last_node); 2480 } 2481 2482 $node->appendChild($last_node); 2483 2484 /* 7.7 Let last node be node. */ 2485 $last_node = $node; 2486 } 2487 2488 /* 8. Insert whatever last node ended up being in 2489 the previous step into the common ancestor node, 2490 first removing it from its previous parent node if 2491 any. */ 2492 if($last_node->parentNode !== null) { 2493 $last_node->parentNode->removeChild($last_node); 2494 } 2495 2496 $common_ancestor->appendChild($last_node); 2497 2498 /* 9. 
Perform a shallow clone of the formatting 2499 element. */ 2500 $clone = $formatting_element->cloneNode(); 2501 2502 /* 10. Take all of the child nodes of the furthest 2503 block and append them to the clone created in the 2504 last step. */ 2505 while($furthest_block->hasChildNodes()) { 2506 $child = $furthest_block->firstChild; 2507 $furthest_block->removeChild($child); 2508 $clone->appendChild($child); 2509 } 2510 2511 /* 11. Append that clone to the furthest block. */ 2512 $furthest_block->appendChild($clone); 2513 2514 /* 12. Remove the formatting element from the list 2515 of active formatting elements, and insert the clone 2516 into the list of active formatting elements at the 2517 position of the aforementioned bookmark. */ 2518 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); 2519 unset($this->a_formatting[$fe_af_pos]); 2520 $this->a_formatting = array_merge($this->a_formatting); 2521 2522 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); 2523 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting)); 2524 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); 2525 2526 /* 13. Remove the formatting element from the stack 2527 of open elements, and insert the clone into the stack 2528 of open elements immediately after (i.e. in a more 2529 deeply nested position than) the position of the 2530 furthest block in that stack. */ 2531 $fe_s_pos = array_search($formatting_element, $this->stack, true); 2532 $fb_s_pos = array_search($furthest_block, $this->stack, true); 2533 unset($this->stack[$fe_s_pos]); 2534 2535 $s_part1 = array_slice($this->stack, 0, $fb_s_pos); 2536 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack)); 2537 $this->stack = array_merge($s_part1, array($clone), $s_part2); 2538 2539 /* 14. Jump back to step 1 in this series of steps. 
*/ 2540 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); 2541 } 2542 break; 2543 2544 /* An end tag token whose tag name is one of: "button", 2545 "marquee", "object" */ 2546 case 'button': case 'marquee': case 'object': 2547 /* If the stack of open elements has an element in scope whose 2548 tag name matches the tag name of the token, then generate implied 2549 tags. */ 2550 if($this->elementInScope($token['name'])) { 2551 $this->generateImpliedEndTags(); 2552 2553 /* Now, if the current node is not an element with the same 2554 tag name as the token, then this is a parse error. */ 2555 // k 2556 2557 /* Now, if the stack of open elements has an element in scope 2558 whose tag name matches the tag name of the token, then pop 2559 elements from the stack until that element has been popped from 2560 the stack, and clear the list of active formatting elements up 2561 to the last marker. */ 2562 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2563 if($this->stack[$n]->nodeName === $token['name']) { 2564 $n = -1; 2565 } 2566 2567 array_pop($this->stack); 2568 } 2569 2570 $marker = end(array_keys($this->a_formatting, self::MARKER, true)); 2571 2572 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) { 2573 array_pop($this->a_formatting); 2574 } 2575 } 2576 break; 2577 2578 /* Or an end tag whose tag name is one of: "area", "basefont", 2579 "bgsound", "br", "embed", "hr", "iframe", "image", "img", 2580 "input", "isindex", "noembed", "noframes", "param", "select", 2581 "spacer", "table", "textarea", "wbr" */ 2582 case 'area': case 'basefont': case 'bgsound': case 'br': 2583 case 'embed': case 'hr': case 'iframe': case 'image': 2584 case 'img': case 'input': case 'isindex': case 'noembed': 2585 case 'noframes': case 'param': case 'select': case 'spacer': 2586 case 'table': case 'textarea': case 'wbr': 2587 // Parse error. Ignore the token. 
/**
 * Processes one token while the insertion mode is "in table".
 *
 * Table-structure start tags open the matching element and switch the
 * insertion mode; an end tag "table" closes the table; every other token is
 * handed to inBody(), with a foster parent selected first when the current
 * node is a table, tbody, tfoot, thead or tr element.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read for
 *                     start/end tags and 'data' for character/comment tokens.
 * @return mixed false when an end tag "table" is ignored (no table element
 *               in table scope), including when that happens while handling
 *               a nested "table" start tag; the result of inTableBody() or
 *               mainPhase() when the token is reprocessed there; otherwise
 *               nothing.
 */
private function inTable($token) {
    /* Element names handed to clearStackToTableContext(). */
    $clear = array('html', 'table');

    $type    = $token['type'];
    $isStart = ($type === HTML5::STARTTAG);
    $isEnd   = ($type === HTML5::ENDTAG);
    $name    = ($isStart || $isEnd) ? $token['name'] : null;

    /* A character token consisting only of U+0009, U+000A, U+000B, U+000C
    or U+0020: append it to the current node. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token: append a Comment node to the current node. */
    } elseif($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif($isStart && $name === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert the element, then switch to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif($isStart && $name === 'colgroup') {
        $this->clearStackToTableContext($clear);

        /* Insert the element, then switch to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col": act as if a "colgroup" start
    tag had been seen, then reprocess the token in "in column group". */
    } elseif($isStart && $name === 'col') {
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($isStart && in_array($name, array('tbody', 'tfoot', 'thead'), true)) {
        $this->clearStackToTableContext($clear);

        /* Insert the element, then switch to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr": act as if a
    "tbody" start tag had been seen, then reprocess the current token. */
    } elseif($isStart && in_array($name, array('td', 'th', 'tr'), true)) {
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table": parse error. Act as if an end
    tag "table" had been seen, then, if that token wasn't ignored,
    reprocess the current token. */
    } elseif($isStart && $name === 'table') {
        $closed = $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        ));

        /* The end-tag branch below returns false only when it ignored the
        token; in that case the start tag is dropped as well. */
        if($closed === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif($isEnd && $name === 'table') {
        /* No table element in table scope: parse error, ignore the token.
        (innerHTML case) */
        if(!$this->elementInScope($name, true)) {
            return false;
        }

        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Pop elements from the stack until a table element has been
        popped; also stop if the stack runs empty. */
        do {
            $popped = array_pop($this->stack);
        } while($popped !== null && $popped->nodeName !== 'table');

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($isEnd && in_array($name, array('body', 'caption', 'col',
    'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else: parse error. Process the token as if the insertion
    mode was "in body", selecting a foster parent first when needed. */
    } else {
        $current = end($this->stack)->nodeName;

        if(in_array($current, array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* Find the last table element in the stack of open elements;
            $n keeps its index for the fallback below. */
            $table = null;

            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if($table === null) {
                /* No table element on the stack (innerHTML case): use the
                first element in the stack of open elements. */
                $this->foster_parent = $this->stack[0];

            } elseif($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                /* The table has a parent that is an element: use it. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* The table has no parent, or its parent is not an
                element: use the element before it in the stack. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        $this->inBody($token);
    }
}
/**
 * Processes one token while the insertion mode is "in caption".
 *
 * An end tag "caption" closes the caption and returns to "in table".
 * Table-structure start tags and an end tag "table" first close the caption
 * and are then reprocessed by inTable(). A fixed set of stray end tags is
 * ignored, and everything else is handed to inBody().
 *
 * @param array $token Token array; 'type' is always read and 'name' is read
 *                     for start/end tags.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there; false when reprocessing is skipped because the
 *               implied end tag "caption" would be ignored; otherwise
 *               nothing.
 */
private function inCaption($token) {
    $type    = $token['type'];
    $isStart = ($type === HTML5::STARTTAG);
    $isEnd   = ($type === HTML5::ENDTAG);
    $name    = ($isStart || $isEnd) ? $token['name'] : null;

    /* Start tags that implicitly close the caption. */
    $closingStarts = array('caption', 'col', 'colgroup', 'tbody', 'td',
        'tfoot', 'th', 'thead', 'tr');

    /* End tags that are parse errors here and are ignored. */
    $ignoredEnds = array('body', 'col', 'colgroup', 'html', 'tbody', 'td',
        'tfoot', 'th', 'thead', 'tr');

    /* An end tag whose tag name is "caption" */
    if($isEnd && $name === 'caption') {
        /* If the stack of open elements does not have a caption element in
        table scope, this is a parse error. Ignore the token.
        (innerHTML case) */
        if($this->elementInScope($name, true)) {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Pop elements from the stack until a caption element has been
            popped; also stop if the stack runs empty. */
            do {
                $popped = array_pop($this->stack);
            } while($popped !== null && $popped->nodeName !== 'caption');

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($isStart && in_array($name, $closingStarts, true)) ||
    ($isEnd && $name === 'table')) {
        /* Parse error. Act as if an end tag "caption" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied end tag is ignored exactly when no caption element is in
        table scope, so test that before emitting it. */
        if(!$this->elementInScope('caption', true)) {
            return false;
        }

        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($isEnd && in_array($name, $ignoredEnds, true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
*/ 2843 $this->mode = self::IN_TABLE; 2844 } 2845 2846 /* A start tag whose tag name is one of: "caption", "col", "colgroup", 2847 "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag 2848 name is "table" */ 2849 } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'], 2850 array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 2851 'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG && 2852 $token['name'] === 'table')) { 2853 /* Parse error. Act as if an end tag with the tag name "caption" 2854 had been seen, then, if that token wasn't ignored, reprocess the 2855 current token. */ 2856 $this->inCaption(array( 2857 'name' => 'caption', 2858 'type' => HTML5::ENDTAG 2859 )); 2860 2861 return $this->inTable($token); 2862 2863 /* An end tag whose tag name is one of: "body", "col", "colgroup", 2864 "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ 2865 } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], 2866 array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th', 2867 'thead', 'tr'))) { 2868 // Parse error. Ignore the token. 2869 2870 /* Anything else */ 2871 } else { 2872 /* Process the token as if the insertion mode was "in body". */ 2873 $this->inBody($token); 2874 } 2875 } 2876 2877 private function inColumnGroup($token) { 2878 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 2879 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 2880 or U+0020 SPACE */ 2881 if($token['type'] === HTML5::CHARACTR && 2882 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { 2883 /* Append the character to the current node. */ 2884 $text = $this->dom->createTextNode($token['data']); 2885 end($this->stack)->appendChild($text); 2886 2887 /* A comment token */ 2888 } elseif($token['type'] === HTML5::COMMENT) { 2889 /* Append a Comment node to the current node with the data 2890 attribute set to the data given in the comment token. 
*/
            $comment = $this->dom->createComment($token['data']);
            end($this->stack)->appendChild($comment);

        /* A start tag whose tag name is "col" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
            /* Insert a col element for the token. Immediately pop the current
            node off the stack of open elements. */
            $this->insertElement($token);
            array_pop($this->stack);

        /* An end tag whose tag name is "colgroup" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'colgroup') {
            /* If the current node is the root html element, then this is a
            parse error, ignore the token. (innerHTML case) */
            if(end($this->stack)->nodeName === 'html') {
                // Ignore

            /* Otherwise, pop the current node (which will be a colgroup
            element) from the stack of open elements. Switch the insertion
            mode to "in table". */
            } else {
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

        /* An end tag whose tag name is "col" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Act as if an end tag with the tag name "colgroup" had been seen,
            and then, if that token wasn't ignored, reprocess the current token. */
            $this->inColumnGroup(array(
                'name' => 'colgroup',
                'type' => HTML5::ENDTAG
            ));

            return $this->inTable($token);
        }
    }

    /**
     * Processes one token while the insertion mode is "in table body".
     *
     * @param array $token Token array; always carries 'type', and tag tokens
     *                     also carry 'name'.
     * @return mixed The result of the handler the token is reprocessed in
     *               (inRow() or mainPhase()); null in every other branch.
     */
    private function inTableBody($token) {
        // Node names at which "clear the stack back to a table body context" stops.
        $clear = array('tbody', 'tfoot', 'thead', 'html');

        /* A start tag whose tag name is "tr" */
        if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Insert a tr element for the token, then switch the insertion
            mode to "in row". */
            $this->insertElement($token);
            $this->mode = self::IN_ROW;

        /* A start tag whose tag name is one of: "th", "td" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')) {
            /* Parse error. Act as if a start tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inTableBody(array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inRow($token);

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore

            /* Otherwise: */
            } else {
                /* Clear the stack back to a table body context. */
                $this->clearStackToTableContext($clear);

                /* Pop the current node from the stack of open elements. Switch
                the insertion mode to "in table". */
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
        // Two corrections here: the list used to spell "tfoot" as "tfoor", and
        // "table" was tested as a start tag although this rule is for its end tag.
        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'), true)) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
            /* If the stack of open elements does not have a tbody, thead, or
            tfoot element in table scope, this is a parse error. Ignore the
            token. (innerHTML case) */
            if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Clear the stack back to a table body context. */
                $this->clearStackToTableContext($clear);

                /* Act as if an end tag with the same tag name as the current
                node ("tbody", "tfoot", or "thead") had been seen, then
                reprocess the current token. */
                $this->inTableBody(array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                ));

                return $this->mainPhase($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th", "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'), true)) {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in table". */
            $this->inTable($token);
        }
    }

    /**
     * Processes one token while the insertion mode is "in row".
     *
     * @param array $token Token array; always carries 'type', and tag tokens
     *                     also carry 'name'.
     */
    private function inRow($token) {
        // Node names at which "clear the stack back to a table row context" stops.
        $clear = array('tr', 'html');

        /* A start tag whose tag name is one of: "th", "td" */
        if($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Insert an HTML element for the token, then switch the insertion
            mode to "in cell". */
            $this->insertElement($token);
            $this->mode = self::IN_CELL;

            /* Insert a marker at the end of the list of active formatting
            elements. */
            $this->a_formatting[] = self::MARKER;

        /* An end tag whose tag name is "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.
/* Otherwise: */
            } else {
                /* Clear the stack back to a table row context. */
                $this->clearStackToTableContext($clear);

                /* Pop the current node (which will be a tr element) from the
                stack of open elements. Switch the insertion mode to "in table
                body". */
                array_pop($this->stack);
                $this->mode = self::IN_TBODY;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
        // The end tag "table" alternative named above was missing from the
        // condition and has been added.
        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'), true)) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
            /* Act as if an end tag with the tag name "tr" had been seen, then,
            if that token wasn't ignored, reprocess the current token. */
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            // Closing the row switches the mode to "in table body" (see the
            // "tr" end tag branch), so the token is reprocessed there. It used
            // to be handed to inCell(), which ignores these start tags when no
            // cell is open, silently dropping the token.
            return $this->inTableBody($token);

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Otherwise, act as if an end tag with the tag name "tr" had
                been seen, then reprocess the current token. */
                $this->inRow(array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                ));

                // Reprocess in "in table body" for the same reason as above.
                return $this->inTableBody($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th" */
        // "tr" was also listed here but could never match: its end tag is
        // already handled by an earlier branch.
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'), true)) {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in table". */
            $this->inTable($token);
        }
    }

    /**
     * Processes one token while the insertion mode is "in cell".
     *
     * @param array $token Token array; always carries 'type', and tag tokens
     *                     also carry 'name'.
     * @return mixed The result of inRow() when the token is reprocessed there
     *               after closing the cell; null in every other branch.
     */
    private function inCell($token) {
        /* An end tag whose tag name is one of: "td", "th" */
        if($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as that of the token, then this is a
            parse error and the token must be ignored. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Generate implied end tags, except for elements with the same
                tag name as the token. */
                $this->generateImpliedEndTags(array($token['name']));

                /* Now, if the current node is not an element with the same tag
                name as the token, then this is a parse error. */
                // k

                /* Pop elements from this stack until an element with the same
                tag name as the token has been popped from the stack. */
                while(true) {
                    $node = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if($node === $token['name']) {
                        break;
                    }
                }

                /* Clear the list of active formatting elements up to the last
                marker. */
                $this->clearTheActiveFormattingElementsUpToTheLastMarker();

                /* Switch the insertion mode to "in row". (The current node
                will be a tr element at this point.) */
                $this->mode = self::IN_ROW;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr" */
        // An identical copy of this branch used to follow it; that copy was
        // unreachable and has been removed.
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr'), true)) {
            /* If the stack of open elements does not have a td or th element
            in table scope, then this is a parse error; ignore the token.
            (innerHTML case) */
            if(!$this->elementInScope(array('td', 'th'), true)) {
                // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
            } else {
                $this->closeCell();
                return $this->inRow($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html'))) {
            /* Parse error. Ignore the token.
*/

        /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
        "thead", "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as that of the token (which can only
            happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
            then this is a parse error and the token must be ignored. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
            } else {
                $this->closeCell();
                return $this->inRow($token);
            }

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in body". */
            $this->inBody($token);
        }
    }

    /**
     * Processes one token while the insertion mode is "in select".
     *
     * @param array $token Token array; always carries 'type'. Tag tokens also
     *                     carry 'name'; character and comment tokens carry
     *                     'data'.
     */
    private function inSelect($token) {
        /* Handle the token as follows: */

        /* A character token */
        if($token['type'] === HTML5::CHARACTR) {
            /* Append the token's character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token whose tag name is "option" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option') {
            /* If the current node is an option element, act as if an end tag
            with the tag name "option" had been seen. */
            if(end($this->stack)->nodeName === 'option') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* Insert an HTML element for the token. */
            $this->insertElement($token);

        /* A start tag token whose tag name is "optgroup" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup') {
            /* If the current node is an option element, act as if an end tag
            with the tag name "option" had been seen. */
            if(end($this->stack)->nodeName === 'option') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* If the current node is an optgroup element, act as if an end tag
            with the tag name "optgroup" had been seen. */
            if(end($this->stack)->nodeName === 'optgroup') {
                $this->inSelect(array(
                    'name' => 'optgroup',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* Insert an HTML element for the token. */
            $this->insertElement($token);

        /* An end tag token whose tag name is "optgroup" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup') {
            /* First, if the current node is an option element, and the node
            immediately before it in the stack of open elements is an optgroup
            element, then act as if an end tag with the tag name "option" had
            been seen. */
            $elements_in_stack = count($this->stack);

            // The size check keeps the [count - 2] lookup inside the array.
            if($elements_in_stack >= 2 &&
            $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* If the current node is an optgroup element, then pop that node
            from the stack of open elements. Otherwise, this is a parse error,
            ignore the token. */
            // This used to compare the stack entry itself (a DOM node) with
            // the string 'optgroup', which is never true, and it indexed with
            // a count taken before the option may have been popped above.
            if(end($this->stack)->nodeName === 'optgroup') {
                array_pop($this->stack);
            }

        /* An end tag token whose tag name is "option" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option') {
            /* If the current node is an option element, then pop that node
            from the stack of open elements. Otherwise, this is a parse error,
            ignore the token. */
            if(end($this->stack)->nodeName === 'option') {
                array_pop($this->stack);
            }

        /* An end tag whose tag name is "select" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if(!$this->elementInScope($token['name'], true)) {
                // w/e

            /* Otherwise: */
            } else {
                /* Pop elements from the stack of open elements until a select
                element has been popped from the stack. */
                while(true) {
                    $current = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if($current === 'select') {
                        break;
                    }
                }

                /* Reset the insertion mode appropriately. */
                $this->resetInsertionMode();
            }

        /* A start tag whose tag name is "select" */
        // The type is tested before the name from here on, so tokens that
        // carry no 'name' key never have that key read.
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select') {
            /* Parse error. Act as if the token had been an end tag with the
            tag name "select" instead. */
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

        /* An end tag whose tag name is one of: "caption", "table", "tbody",
        "tfoot", "thead", "tr", "td", "th" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'), true)) {
            /* Parse error. */
            // w/e

            /* If the stack of open elements has an element in table scope with
            the same tag name as that of the token, then act as if an end tag
            with the tag name "select" had been seen, and reprocess the token.
            Otherwise, ignore the token. */
            if($this->elementInScope($token['name'], true)) {
                $this->inSelect(array(
                    'name' => 'select',
                    'type' => HTML5::ENDTAG
                ));

                $this->mainPhase($token);
            }

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Processes one token while the insertion mode is "after body".
     *
     * @param array $token Token array; always carries 'type', and tag tokens
     *                     also carry 'name'.
     */
    private function afterBody($token) {
        /* Handle the token as follows: */

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Process the token as it would be processed if the insertion mode
            was "in body". */
            $this->inBody($token);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the first element in the stack of open
            elements (the html element), with the data attribute set to the
            data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->stack[0]->appendChild($comment);

        /* An end tag with the tag name "html" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
            /* If the parser was originally created in order to handle the
            setting of an element's innerHTML attribute, this is a parse error;
            ignore the token. (The element will be an html element in this
            case.) (innerHTML case) */

            /* Otherwise, switch to the trailing end phase. */
            $this->phase = self::END_PHASE;

        /* Anything else */
        } else {
            /* Parse error.
Set the insertion mode to "in body" and reprocess
            the token. */
            $this->mode = self::IN_BODY;
            return $this->inBody($token);
        }
    }

    /**
     * Processes one token while the insertion mode is "in frameset".
     *
     * @param array $token Token array; always carries 'type'. Tag tokens also
     *                     carry 'name'; character and comment tokens carry
     *                     'data'.
     */
    private function inFrameset($token) {
        /* Handle the token as follows: */

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag with the tag name "frameset" */
        // In the tag branches the type is tested before the name, so tokens
        // that carry no 'name' key never have that key read.
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frameset') {
            $this->insertElement($token);

        /* An end tag with the tag name "frameset" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'frameset') {
            /* If the current node is the root html element, then this is a
            parse error; ignore the token. (innerHTML case) */
            if(end($this->stack)->nodeName === 'html') {
                // Ignore

            } else {
                /* Otherwise, pop the current node from the stack of open
                elements. */
                array_pop($this->stack);

                /* If the parser was not originally created in order to handle
                the setting of an element's innerHTML attribute (innerHTML case),
                and the current node is no longer a frameset element, then change
                the insertion mode to "after frameset". */
                // The mode used to change unconditionally, so closing an inner
                // frameset ended frameset parsing for the outer one as well.
                if(end($this->stack)->nodeName !== 'frameset') {
                    $this->mode = self::AFTR_FRAME;
                }
            }

        /* A start tag with the tag name "frame" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frame') {
            /* Insert an HTML element for the token. */
            $this->insertElement($token);

            /* Immediately pop the current node off the stack of open elements. */
            array_pop($this->stack);

        /* A start tag with the tag name "noframes" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes') {
            /* Process the token as if the insertion mode had been "in body". */
            $this->inBody($token);

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Processes one token while the insertion mode is "after frameset".
     *
     * @param array $token Token array; always carries 'type'. Tag tokens also
     *                     carry 'name'; character and comment tokens carry
     *                     'data'.
     */
    private function afterFrameset($token) {
        /* Handle the token as follows: */

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* An end tag with the tag name "html" */
        // Type is tested before name so tokens without a 'name' key are safe.
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'html') {
            /* Switch to the trailing end phase. */
            $this->phase = self::END_PHASE;

        /* A start tag with the tag name "noframes" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes') {
            /* Process the token as if the insertion mode had been "in body".
*/
            $this->inBody($token);

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Processes one token during the trailing end phase.
     *
     * @param array $token Token array; always carries 'type', and character
     *                     and comment tokens also carry 'data'.
     * @return mixed The result of mainPhase() when the parser switches back to
     *               the main phase; null in every other branch.
     */
    private function trailingEndPhase($token) {
        /* After the main phase, as each token is emitted from the tokenisation
        stage, it must be processed as described in this section. */

        /* A DOCTYPE token */
        if($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the Document object with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->dom->appendChild($comment);

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        } elseif($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Process the token as it would be processed in the main phase. */
            $this->mainPhase($token);

        /* A character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE. Or a start tag token. Or an end tag token. */
        // The match is negated here. It used to repeat the whitespace test of
        // the previous branch, so it could never be true for a character token
        // and non-whitespace text was silently discarded.
        } elseif(($token['type'] === HTML5::CHARACTR &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
        $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
            /* Parse error. Switch back to the main phase and reprocess the
            token. */
            $this->phase = self::MAIN_PHASE;
            return $this->mainPhase($token);

        /* An end-of-file token */
        } elseif($token['type'] === HTML5::EOF) {
            /* OMG DONE!! */
        }
    }

    /**
     * Creates an element for a start tag token, copies the token's attributes
     * onto it, inserts it via appendToRealParent() and pushes it onto the
     * stack of open elements.
     *
     * @param array $token  Start tag token with 'name' and an 'attr' list of
     *                      arrays holding 'name' and 'value'.
     * @param bool  $append Not used by this method.
     * @param bool  $check  When true, the tag name is reduced to ASCII
     *                      letters, digits and hyphens before use.
     * @return DOMElement The element that was created.
     */
    private function insertElement($token, $append = true, $check = false) {
        // Proprietary workaround for libxml2's limitations with tag names
        if ($check) {
            // Slightly modified HTML5 tag-name modification,
            // removing anything that's not an ASCII letter, digit, or hyphen
            $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']);
            // Remove leading hyphens and numbers
            $token['name'] = ltrim($token['name'], '-0..9');
            // In theory, this should never be needed, but just in case
            if ($token['name'] === '') $token['name'] = 'span'; // arbitrary generic choice
        }

        $el = $this->dom->createElement($token['name']);

        // When an attribute name repeats, the first occurrence wins.
        foreach($token['attr'] as $attr) {
            if(!$el->hasAttribute($attr['name'])) {
                $el->setAttribute($attr['name'], $attr['value']);
            }
        }

        $this->appendToRealParent($el);
        $this->stack[] = $el;

        return $el;
    }

    /**
     * Creates a text node and inserts it via appendToRealParent().
     *
     * @param string $data Text content of the new node.
     */
    private function insertText($data) {
        $text = $this->dom->createTextNode($data);
        $this->appendToRealParent($text);
    }

    /**
     * Creates a comment node and inserts it via appendToRealParent().
     *
     * @param string $data Content of the new comment.
     */
    private function insertComment($data) {
        $comment = $this->dom->createComment($data);
        $this->appendToRealParent($comment);
    }

    private function appendToRealParent($node) {
        if($this->foster_parent === null) {
            end($this->stack)->appendChild($node);

        } elseif($this->foster_parent !== null) {
            /* If the foster parent element is the parent element of the
            last table element in the stack of open elements, then the new
            node must be inserted immediately before the last table element
            in the stack of open elements in the foster parent element;
            otherwise, the new node must be appended to the foster parent
            element.
*/
            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table' &&
                $this->stack[$n]->parentNode !== null) {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if(isset($table) && $this->foster_parent->isSameNode($table->parentNode))
                $this->foster_parent->insertBefore($node, $table);
            else
                $this->foster_parent->appendChild($node);

            $this->foster_parent = null;
        }
    }

    /**
     * Tells whether the stack of open elements has an element in scope.
     *
     * The stack is walked from the current node downwards until the target is
     * found or a node that ends the search is reached.
     *
     * @param string|array $el    Tag name to look for, or a list of tag names,
     *                            any of which counts as a match.
     * @param bool         $table True for the "in table scope" variant, false
     *                            for the plain "in scope" variant.
     * @return bool True when a matching element is in scope.
     */
    private function elementInScope($el, $table = false) {
        if(is_array($el)) {
            foreach($el as $element) {
                if($this->elementInScope($element, $table)) {
                    return true;
                }
            }

            return false;
        }

        $leng = count($this->stack);

        for($n = 0; $n < $leng; $n++) {
            /* 1. Initialise node to be the current node (the bottommost node of
            the stack). */
            $node = $this->stack[$leng - 1 - $n];

            if($node->tagName === $el) {
                /* 2. If node is the target node, terminate in a match state. */
                return true;

            } elseif($node->tagName === 'table') {
                /* 3. Otherwise, if node is a table element, terminate in a failure
                state. */
                return false;

            } elseif($table === false && in_array($node->tagName, array('caption', 'td',
            'th', 'button', 'marquee', 'object'), true)) {
                /* 4. Otherwise, if the algorithm is the "has an element in scope"
                variant (rather than the "has an element in table scope" variant),
                and node is one of the following, terminate in a failure state. */
                // The flag test used to be inverted ($table === true), which
                // applied this rule to the table-scope variant instead.
                return false;

            } elseif($node === $node->ownerDocument->documentElement) {
                /* 5. Otherwise, if node is an html element (root element), terminate
                in a failure state. (This can only happen if the node is the topmost
                node of the stack of open elements, and prevents the next step from
                being invoked if there are no more elements in the stack.) */
                return false;
            }

            /* Otherwise, set node to the previous entry in the stack of open
            elements and return to step 2. (This will never fail, since the loop
            will always terminate in the previous step if the top of the stack
            is reached.) */
        }

        // Reached when the stack is empty or no step above ended the search;
        // return an explicit boolean instead of falling off the end with null.
        return false;
    }

    /**
     * Re-opens formatting elements that are in the list of active formatting
     * elements but no longer on the stack of open elements, by cloning them
     * onto the current node.
     *
     * @return false|null False when there is nothing to reconstruct;
     *                    otherwise nothing is returned.
     */
    private function reconstructActiveFormattingElements() {
        /* 1. If there are no entries in the list of active formatting elements,
        then there is nothing to reconstruct; stop this algorithm. */
        $formatting_elements = count($this->a_formatting);

        if($formatting_elements === 0) {
            return false;
        }

        /* 3. Let entry be the last (most recently added) element in the list
        of active formatting elements. */
        $entry = end($this->a_formatting);

        /* 2. If the last (most recently added) entry in the list of active
        formatting elements is a marker, or if it is an element that is in the
        stack of open elements, then there is nothing to reconstruct; stop this
        algorithm. */
        if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            return false;
        }

        // When the rewind loop stops on a marker or an element that is still
        // open, step 7 must run first to advance past that entry. The flag
        // used to stay unset on that path, so step 8 tried to clone the marker
        // or duplicated the open element. It is only cleared when the start of
        // the list is reached (jump straight to step 8).
        $step_seven = true;

        for($a = $formatting_elements - 1; $a >= 0; true) {
            /* 4. If there are no entries before entry in the list of active
            formatting elements, then jump to step 8. */
            if($a === 0) {
                $step_seven = false;
                break;
            }

            /* 5. Let entry be the entry one earlier than entry in the list of
            active formatting elements. */
            $a--;
            $entry = $this->a_formatting[$a];

            /* 6. If entry is neither a marker nor an element that is also in
            the stack of open elements, go to step 4. */
            if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
                break;
            }
        }

        while(true) {
            /* 7. Let entry be the element one later than entry in the list of
            active formatting elements.
*/
            if(isset($step_seven) && $step_seven === true) {
                $a++;
                $entry = $this->a_formatting[$a];
            }

            /* 8. Perform a shallow clone of the element entry to obtain clone. */
            $clone = $entry->cloneNode();

            /* 9. Append clone to the current node and push it onto the stack
            of open elements so that it is the new current node. */
            end($this->stack)->appendChild($clone);
            $this->stack[] = $clone;

            /* 10. Replace the entry for entry in the list with an entry for
            clone. */
            $this->a_formatting[$a] = $clone;

            /* 11. If the entry for clone in the list of active formatting
            elements is not the last entry in the list, return to step 7. */
            if(end($this->a_formatting) !== $clone) {
                $step_seven = true;
            } else {
                break;
            }
        }
    }

    /**
     * Removes entries from the end of the list of active formatting elements
     * up to and including the most recent marker. If the list holds no marker
     * it is emptied.
     */
    private function clearTheActiveFormattingElementsUpToTheLastMarker() {
        /* When the steps below require the UA to clear the list of active
        formatting elements up to the last marker, the UA must perform the
        following steps: */

        // Stopping on an empty list matters: without a marker the previous
        // while(true) loop kept popping an empty array and never terminated.
        while(count($this->a_formatting) > 0) {
            /* 1. Let entry be the last (most recently added) entry in the list
            of active formatting elements. */
            /* 2. Remove entry from the list of active formatting elements. */
            $entry = array_pop($this->a_formatting);

            /* 3. If entry was a marker, then stop the algorithm at this point.
            The list has been cleared up to the last marker. */
            if($entry === self::MARKER) {
                break;
            }
        }
    }

    /**
     * Pops elements whose end tag is implied (dd, dt, li, p, td, th, tr) off
     * the stack of open elements for as long as the current node is one of
     * them.
     *
     * @param array $exclude Tag names that must not be closed implicitly.
     */
    private function generateImpliedEndTags($exclude = array()) {
        /* When the steps below require the UA to generate implied end tags,
        then, if the current node is a dd element, a dt element, an li element,
        a p element, a td element, a th element, or a tr element, the UA must
        act as if an end tag with the respective tag name had been seen and
        then generate implied end tags again. */
        $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

        // end() yields false once the stack is empty; stop there rather than
        // reading a property of a non-object.
        while(($current = end($this->stack)) !== false &&
        in_array($current->nodeName, $elements, true)) {
            array_pop($this->stack);
        }
    }

    /**
     * Classifies an element by looking its tag name up in the special,
     * scoping and formatting lists, in that order.
     *
     * @param DOMElement $node Element to classify.
     * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
     *               self::PHRASING.
     */
    private function getElementCategory($node) {
        $name = $node->tagName;

        if(in_array($name, $this->special)) {
            return self::SPECIAL;
        }

        if(in_array($name, $this->scoping)) {
            return self::SCOPING;
        }

        if(in_array($name, $this->formatting)) {
            return self::FORMATTING;
        }

        return self::PHRASING;
    }

    /**
     * Pops elements off the stack of open elements until the current node's
     * name is one of the given names.
     *
     * @param array $elements Node names at which popping stops.
     */
    private function clearStackToTableContext($elements) {
        /* When the steps above require the UA to clear the stack back to a
        table context, it means that the UA must, while the current node is not
        a table element or an html element, pop elements from the stack of open
        elements. If this causes any elements to be popped from the stack, then
        this is a parse error. */
        // Also stop on an empty stack, where end() yields false.
        while(($current = end($this->stack)) !== false &&
        !in_array($current->nodeName, $elements, true)) {
            array_pop($this->stack);
        }
    }

    private function resetInsertionMode() {
        /* 1. Let last be false. */
        $last = false;
        $leng = count($this->stack);

        for($n = $leng - 1; $n >= 0; $n--) {
            /* 2. Let node be the last node in the stack of open elements. */
            $node = $this->stack[$n];

            /* 3. If node is the first node in the stack of open elements, then
            set last to true. If the element whose innerHTML attribute is being
            set is neither a td element nor a th element, then set node to the
            element whose innerHTML attribute is being set. (innerHTML case) */
            if($this->stack[0]->isSameNode($node)) {
                $last = true;
            }

            /* 4. If node is a select element, then switch the insertion mode to
            "in select" and abort these steps.
(innerHTML case) */ 3808 if($node->nodeName === 'select') { 3809 $this->mode = self::IN_SELECT; 3810 break; 3811 3812 /* 5. If node is a td or th element, then switch the insertion mode 3813 to "in cell" and abort these steps. */ 3814 } elseif($node->nodeName === 'td' || $node->nodeName === 'th') { 3815 $this->mode = self::IN_CELL; 3816 break; 3817 3818 /* 6. If node is a tr element, then switch the insertion mode to 3819 "in row" and abort these steps. */ 3820 } elseif($node->nodeName === 'tr') { 3821 $this->mode = self::IN_ROW; 3822 break; 3823 3824 /* 7. If node is a tbody, thead, or tfoot element, then switch the 3825 insertion mode to "in table body" and abort these steps. */ 3826 } elseif(in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) { 3827 $this->mode = self::IN_TBODY; 3828 break; 3829 3830 /* 8. If node is a caption element, then switch the insertion mode 3831 to "in caption" and abort these steps. */ 3832 } elseif($node->nodeName === 'caption') { 3833 $this->mode = self::IN_CAPTION; 3834 break; 3835 3836 /* 9. If node is a colgroup element, then switch the insertion mode 3837 to "in column group" and abort these steps. (innerHTML case) */ 3838 } elseif($node->nodeName === 'colgroup') { 3839 $this->mode = self::IN_CGROUP; 3840 break; 3841 3842 /* 10. If node is a table element, then switch the insertion mode 3843 to "in table" and abort these steps. */ 3844 } elseif($node->nodeName === 'table') { 3845 $this->mode = self::IN_TABLE; 3846 break; 3847 3848 /* 11. If node is a head element, then switch the insertion mode 3849 to "in body" ("in body"! not "in head"!) and abort these steps. 3850 (innerHTML case) */ 3851 } elseif($node->nodeName === 'head') { 3852 $this->mode = self::IN_BODY; 3853 break; 3854 3855 /* 12. If node is a body element, then switch the insertion mode to 3856 "in body" and abort these steps. */ 3857 } elseif($node->nodeName === 'body') { 3858 $this->mode = self::IN_BODY; 3859 break; 3860 3861 /* 13. 
If node is a frameset element, then switch the insertion 3862 mode to "in frameset" and abort these steps. (innerHTML case) */ 3863 } elseif($node->nodeName === 'frameset') { 3864 $this->mode = self::IN_FRAME; 3865 break; 3866 3867 /* 14. If node is an html element, then: if the head element 3868 pointer is null, switch the insertion mode to "before head", 3869 otherwise, switch the insertion mode to "after head". In either 3870 case, abort these steps. (innerHTML case) */ 3871 } elseif($node->nodeName === 'html') { 3872 $this->mode = ($this->head_pointer === null) 3873 ? self::BEFOR_HEAD 3874 : self::AFTER_HEAD; 3875 3876 break; 3877 3878 /* 15. If last is true, then set the insertion mode to "in body" 3879 and abort these steps. (innerHTML case) */ 3880 } elseif($last) { 3881 $this->mode = self::IN_BODY; 3882 break; 3883 } 3884 } 3885 } 3886 3887 private function closeCell() { 3888 /* If the stack of open elements has a td or th element in table scope, 3889 then act as if an end tag token with that tag name had been seen. */ 3890 foreach(array('td', 'th') as $cell) { 3891 if($this->elementInScope($cell, true)) { 3892 $this->inCell(array( 3893 'name' => $cell, 3894 'type' => HTML5::ENDTAG 3895 )); 3896 3897 break; 3898 } 3899 } 3900 } 3901 3902 public function save() { 3903 return $this->dom; 3904 } 3905 } 3906 ?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 20:08:37 2014 | Cross-referenced by PHPXref 0.7.1 |