[ Index ] |
PHP Cross Reference of vtigercrm-6.1.0 |
[Summary view] [Print] [Text view]
<?php

/**
 * Our in-house implementation of a parser.
 *
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
 * it a reasonably good default for PHP4.  Written with efficiency in mind,
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
 *
 * @todo Reread XML spec and document differences.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{

    public $tracksLineNumbers = true;

    /**
     * Whitespace characters for str(c)spn: space, tab, CR, LF.
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge: escapes the contents of a
     * script element with htmlspecialchars() and leaves the tags untouched.
     *
     * @param $matches, in form of array(full match, opening tag, contents, closing tag)
     * @returns Opening tag, escaped contents and closing tag, concatenated.
     */
    protected function scriptCallback($matches) {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Splits an HTML string into a flat list of token objects.
     *
     * Emits HTMLPurifier_Token_Text, _Comment, _Start, _End and _Empty
     * tokens in document order. While running, 'CurrentLine' and
     * 'CurrentCol' are registered in $context; they are destroyed again
     * before returning.
     *
     * @param $html    String of HTML to tokenize.
     * @param $config  Configuration object; read via get(namespace, key).
     * @param $context Context object; used for the position variables and,
     *                 when Core.CollectErrors is on, the 'ErrorCollector'.
     * @returns Array of tokens.
     */
    public function tokenizeHTML($html, $config, $context) {

        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML', 'Trusted')) {
            $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'), $html);
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core', 'CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col  = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col  = false;
            $length = false;
        }
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol',  $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core', 'CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        while(++$loops) {

            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {

                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int) $inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if (
                    $synchronize_interval &&  // synchronization is on
                    $cursor > 0 &&            // cursor is further than zero
                    $loops % $synchronize_interval === 0 // time to synchronize!
                ) {
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }

            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
                                $html, $cursor, $position_next_lt - $cursor
                            )
                        )
                    );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor  = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
                                $html, $cursor
                            )
                        )
                    );
                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // there's nothing to process!
                    // NOTE(review): this token is built but never appended
                    // to $array, and $inside_tag stays true -- confirm intent.
                    $token = new HTMLPurifier_Token_Text('<');
                    $cursor++;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (
                    substr($segment, 0, 3) === '!--'
                ) {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    $token = new
                        HTMLPurifier_Token_Comment(
                            substr(
                                $segment, 3, $strlen_segment - 3
                            )
                        );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the closing "-->" only if there actually was one
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment,'/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alphabetic, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    // cursor is deliberately left in place: what follows the
                    // "<" is re-read as ordinary text on the next round
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string =
                    trim(
                        substr(
                            $segment, $position_first_space
                        )
                    );
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                                    $attribute_string
                                  , $config, $context
                              );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
                $token = new
                    HTMLPurifier_Token_Text(
                        '<' .
                        $this->parseData(
                            substr($html, $cursor)
                        )
                    );
                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
            break;
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }

    /**
     * PHP 5.0.x compatible substr_count that implements offset and length
     *
     * @param $haystack String to search in.
     * @param $needle   Substring to count.
     * @param $offset   Byte offset at which to start counting.
     * @param $length   Number of bytes after $offset to search.
     * @returns Number of occurrences of $needle in the given window.
     */
    protected function substrCount($haystack, $needle, $offset, $length) {
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            // substr_count() has no offset/length before 5.1: slice first
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * Boolean attributes (no "=") map to their own name. Values are passed
     * through parseData() on both the single-attribute fast path and the
     * general path.
     *
     * @param $string  Inside of tag excluding name.
     * @param $config  Configuration object (Core.CollectErrors is read).
     * @param $context Context object ('ErrorCollector' is fetched if needed).
     * @returns Assoc array of attributes.
     */
    public function parseAttributeString($string, $config, $context) {
        $string = (string) $string; // quick typecast

        if ($string == '') return array(); // no attributes

        $e = false;
        if ($config->get('Core', 'CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no spaces => one attribute
        $num_equal = substr_count($string, '=');
        $has_space = strpos($string, ' ');
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                return array();
            }
            // compare against '' explicitly: a loose truthiness test would
            // also discard the legitimate unquoted value "0"
            if ($quoted_value === '') return array($key => '');
            $first_char = @$quoted_value[0];
            $last_char  = @$quoted_value[strlen($quoted_value)-1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ( $same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            if ($value === false) $value = '';
            // decode the value exactly like the general loop below does
            return array($key => $this->parseData($value));
        }

        // setup loop environment
        $array  = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size   = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        while(true) {

            if ($cursor >= $size) {
                break;
            }

            $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
            // grab the key

            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                // prevent infinite loop: step over the offending character
                // AND the run of non-whitespace that follows it. The span is
                // measured from $cursor + 1, so the 1 must be added back;
                // without it "= x" never advances and "=foo" stops one
                // character short.
                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = @$string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                if ($cursor === false) {
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = @$string[$cursor];

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) $value = '';
                $array[$key] = $this->parseData($value);
                $cursor++;

            } else {
                // boolattr
                if ($key !== '') {
                    $array[$key] = $key;
                } else {
                    // purely theoretical
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }

            }
        }
        return $array;
    }

}

// vim: et sw=4 sts=4
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 20:08:37 2014 | Cross-referenced by PHPXref 0.7.1 |