PHPXRef 0.7.1 : vtigercrm-6.1.0 : /libraries/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  /**
   4   * Our in-house implementation of a parser.
   5   *
   6   * A pure PHP parser, DirectLex has absolutely no dependencies, making
   7   * it a reasonably good default for PHP4.  Written with efficiency in mind,
   8   * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
   9   * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  10   *
  11   * @todo Reread XML spec and document differences.
  12   */
  13  class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  14  {
  15  
  16      public $tracksLineNumbers = true;
  17  
  18      /**
  19       * Whitespace characters for str(c)spn.
  20       */
  21      protected $_whitespace = "\x20\x09\x0D\x0A";
  22  
  23      /**
  24       * Callback function for script CDATA fudge
  25       * @param $matches, in form of array(opening tag, contents, closing tag)
  26       */
  27      protected function scriptCallback($matches) {
  28          return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  29      }
  30  
  31      public function tokenizeHTML($html, $config, $context) {
  32  
  33          // special normalization for script tags without any armor
  34          // our "armor" heurstic is a < sign any number of whitespaces after
  35          // the first script tag
  36          if ($config->get('HTML', 'Trusted')) {
  37              $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  38                  array($this, 'scriptCallback'), $html);
  39          }
  40  
  41          $html = $this->normalize($html, $config, $context);
  42  
  43          $cursor = 0; // our location in the text
  44          $inside_tag = false; // whether or not we're parsing the inside of a tag
  45          $array = array(); // result array
  46  
  47          // This is also treated to mean maintain *column* numbers too
  48          $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
  49  
  50          if ($maintain_line_numbers === null) {
  51              // automatically determine line numbering by checking
  52              // if error collection is on
  53              $maintain_line_numbers = $config->get('Core', 'CollectErrors');
  54          }
  55  
  56          if ($maintain_line_numbers) {
  57              $current_line = 1;
  58              $current_col  = 0;
  59              $length = strlen($html);
  60          } else {
  61              $current_line = false;
  62              $current_col  = false;
  63              $length = false;
  64          }
  65          $context->register('CurrentLine', $current_line);
  66          $context->register('CurrentCol',  $current_col);
  67          $nl = "\n";
  68          // how often to manually recalculate. This will ALWAYS be right,
  69          // but it's pretty wasteful. Set to 0 to turn off
  70          $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
  71  
  72          $e = false;
  73          if ($config->get('Core', 'CollectErrors')) {
  74              $e =& $context->get('ErrorCollector');
  75          }
  76  
  77          // for testing synchronization
  78          $loops = 0;
  79  
  80          while(++$loops) {
  81  
  82              // $cursor is either at the start of a token, or inside of
  83              // a tag (i.e. there was a < immediately before it), as indicated
  84              // by $inside_tag
  85  
  86              if ($maintain_line_numbers) {
  87  
  88                  // $rcursor, however, is always at the start of a token.
  89                  $rcursor = $cursor - (int) $inside_tag;
  90  
  91                  // Column number is cheap, so we calculate it every round.
  92                  // We're interested at the *end* of the newline string, so
  93                  // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
  94                  // from our "rcursor" position.
  95                  $nl_pos = strrpos($html, $nl, $rcursor - $length);
  96                  $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
  97  
  98                  // recalculate lines
  99                  if (
 100                      $synchronize_interval &&  // synchronization is on
 101                      $cursor > 0 &&            // cursor is further than zero
 102                      $loops % $synchronize_interval === 0 // time to synchronize!
 103                  ) {
 104                      $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
 105                  }
 106  
 107              }
 108  
 109              $position_next_lt = strpos($html, '<', $cursor);
 110              $position_next_gt = strpos($html, '>', $cursor);
 111  
 112              // triggers on "<b>asdf</b>" but not "asdf <b></b>"
 113              // special case to set up context
 114              if ($position_next_lt === $cursor) {
 115                  $inside_tag = true;
 116                  $cursor++;
 117              }
 118  
 119              if (!$inside_tag && $position_next_lt !== false) {
 120                  // We are not inside tag and there still is another tag to parse
 121                  $token = new
 122                      HTMLPurifier_Token_Text(
 123                          $this->parseData(
 124                              substr(
 125                                  $html, $cursor, $position_next_lt - $cursor
 126                              )
 127                          )
 128                      );
 129                  if ($maintain_line_numbers) {
 130                      $token->rawPosition($current_line, $current_col);
 131                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
 132                  }
 133                  $array[] = $token;
 134                  $cursor  = $position_next_lt + 1;
 135                  $inside_tag = true;
 136                  continue;
 137              } elseif (!$inside_tag) {
 138                  // We are not inside tag but there are no more tags
 139                  // If we're already at the end, break
 140                  if ($cursor === strlen($html)) break;
 141                  // Create Text of rest of string
 142                  $token = new
 143                      HTMLPurifier_Token_Text(
 144                          $this->parseData(
 145                              substr(
 146                                  $html, $cursor
 147                              )
 148                          )
 149                      );
 150                  if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
 151                  $array[] = $token;
 152                  break;
 153              } elseif ($inside_tag && $position_next_gt !== false) {
 154                  // We are in tag and it is well formed
 155                  // Grab the internals of the tag
 156                  $strlen_segment = $position_next_gt - $cursor;
 157  
 158                  if ($strlen_segment < 1) {
 159                      // there's nothing to process!
 160                      $token = new HTMLPurifier_Token_Text('<');
 161                      $cursor++;
 162                      continue;
 163                  }
 164  
 165                  $segment = substr($html, $cursor, $strlen_segment);
 166  
 167                  if ($segment === false) {
 168                      // somehow, we attempted to access beyond the end of
 169                      // the string, defense-in-depth, reported by Nate Abele
 170                      break;
 171                  }
 172  
 173                  // Check if it's a comment
 174                  if (
 175                      substr($segment, 0, 3) === '!--'
 176                  ) {
 177                      // re-determine segment length, looking for -->
 178                      $position_comment_end = strpos($html, '-->', $cursor);
 179                      if ($position_comment_end === false) {
 180                          // uh oh, we have a comment that extends to
 181                          // infinity. Can't be helped: set comment
 182                          // end position to end of string
 183                          if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
 184                          $position_comment_end = strlen($html);
 185                          $end = true;
 186                      } else {
 187                          $end = false;
 188                      }
 189                      $strlen_segment = $position_comment_end - $cursor;
 190                      $segment = substr($html, $cursor, $strlen_segment);
 191                      $token = new
 192                          HTMLPurifier_Token_Comment(
 193                              substr(
 194                                  $segment, 3, $strlen_segment - 3
 195                              )
 196                          );
 197                      if ($maintain_line_numbers) {
 198                          $token->rawPosition($current_line, $current_col);
 199                          $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
 200                      }
 201                      $array[] = $token;
 202                      $cursor = $end ? $position_comment_end : $position_comment_end + 3;
 203                      $inside_tag = false;
 204                      continue;
 205                  }
 206  
 207                  // Check if it's an end tag
 208                  $is_end_tag = (strpos($segment,'/') === 0);
 209                  if ($is_end_tag) {
 210                      $type = substr($segment, 1);
 211                      $token = new HTMLPurifier_Token_End($type);
 212                      if ($maintain_line_numbers) {
 213                          $token->rawPosition($current_line, $current_col);
 214                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 215                      }
 216                      $array[] = $token;
 217                      $inside_tag = false;
 218                      $cursor = $position_next_gt + 1;
 219                      continue;
 220                  }
 221  
 222                  // Check leading character is alnum, if not, we may
 223                  // have accidently grabbed an emoticon. Translate into
 224                  // text and go our merry way
 225                  if (!ctype_alpha($segment[0])) {
 226                      // XML:  $segment[0] !== '_' && $segment[0] !== ':'
 227                      if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
 228                      $token = new HTMLPurifier_Token_Text('<');
 229                      if ($maintain_line_numbers) {
 230                          $token->rawPosition($current_line, $current_col);
 231                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 232                      }
 233                      $array[] = $token;
 234                      $inside_tag = false;
 235                      continue;
 236                  }
 237  
 238                  // Check if it is explicitly self closing, if so, remove
 239                  // trailing slash. Remember, we could have a tag like <br>, so
 240                  // any later token processing scripts must convert improperly
 241                  // classified EmptyTags from StartTags.
 242                  $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
 243                  if ($is_self_closing) {
 244                      $strlen_segment--;
 245                      $segment = substr($segment, 0, $strlen_segment);
 246                  }
 247  
 248                  // Check if there are any attributes
 249                  $position_first_space = strcspn($segment, $this->_whitespace);
 250  
 251                  if ($position_first_space >= $strlen_segment) {
 252                      if ($is_self_closing) {
 253                          $token = new HTMLPurifier_Token_Empty($segment);
 254                      } else {
 255                          $token = new HTMLPurifier_Token_Start($segment);
 256                      }
 257                      if ($maintain_line_numbers) {
 258                          $token->rawPosition($current_line, $current_col);
 259                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 260                      }
 261                      $array[] = $token;
 262                      $inside_tag = false;
 263                      $cursor = $position_next_gt + 1;
 264                      continue;
 265                  }
 266  
 267                  // Grab out all the data
 268                  $type = substr($segment, 0, $position_first_space);
 269                  $attribute_string =
 270                      trim(
 271                          substr(
 272                              $segment, $position_first_space
 273                          )
 274                      );
 275                  if ($attribute_string) {
 276                      $attr = $this->parseAttributeString(
 277                                      $attribute_string
 278                                    , $config, $context
 279                                );
 280                  } else {
 281                      $attr = array();
 282                  }
 283  
 284                  if ($is_self_closing) {
 285                      $token = new HTMLPurifier_Token_Empty($type, $attr);
 286                  } else {
 287                      $token = new HTMLPurifier_Token_Start($type, $attr);
 288                  }
 289                  if ($maintain_line_numbers) {
 290                      $token->rawPosition($current_line, $current_col);
 291                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 292                  }
 293                  $array[] = $token;
 294                  $cursor = $position_next_gt + 1;
 295                  $inside_tag = false;
 296                  continue;
 297              } else {
 298                  // inside tag, but there's no ending > sign
 299                  if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
 300                  $token = new
 301                      HTMLPurifier_Token_Text(
 302                          '<' .
 303                          $this->parseData(
 304                              substr($html, $cursor)
 305                          )
 306                      );
 307                  if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
 308                  // no cursor scroll? Hmm...
 309                  $array[] = $token;
 310                  break;
 311              }
 312              break;
 313          }
 314  
 315          $context->destroy('CurrentLine');
 316          $context->destroy('CurrentCol');
 317          return $array;
 318      }
 319  
 320      /**
 321       * PHP 5.0.x compatible substr_count that implements offset and length
 322       */
 323      protected function substrCount($haystack, $needle, $offset, $length) {
 324          static $oldVersion;
 325          if ($oldVersion === null) {
 326              $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
 327          }
 328          if ($oldVersion) {
 329              $haystack = substr($haystack, $offset, $length);
 330              return substr_count($haystack, $needle);
 331          } else {
 332              return substr_count($haystack, $needle, $offset, $length);
 333          }
 334      }
 335  
 336      /**
 337       * Takes the inside of an HTML tag and makes an assoc array of attributes.
 338       *
 339       * @param $string Inside of tag excluding name.
 340       * @returns Assoc array of attributes.
 341       */
 342      public function parseAttributeString($string, $config, $context) {
 343          $string = (string) $string; // quick typecast
 344  
 345          if ($string == '') return array(); // no attributes
 346  
 347          $e = false;
 348          if ($config->get('Core', 'CollectErrors')) {
 349              $e =& $context->get('ErrorCollector');
 350          }
 351  
 352          // let's see if we can abort as quickly as possible
 353          // one equal sign, no spaces => one attribute
 354          $num_equal = substr_count($string, '=');
 355          $has_space = strpos($string, ' ');
 356          if ($num_equal === 0 && !$has_space) {
 357              // bool attribute
 358              return array($string => $string);
 359          } elseif ($num_equal === 1 && !$has_space) {
 360              // only one attribute
 361              list($key, $quoted_value) = explode('=', $string);
 362              $quoted_value = trim($quoted_value);
 363              if (!$key) {
 364                  if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 365                  return array();
 366              }
 367              if (!$quoted_value) return array($key => '');
 368              $first_char = @$quoted_value[0];
 369              $last_char  = @$quoted_value[strlen($quoted_value)-1];
 370  
 371              $same_quote = ($first_char == $last_char);
 372              $open_quote = ($first_char == '"' || $first_char == "'");
 373  
 374              if ( $same_quote && $open_quote) {
 375                  // well behaved
 376                  $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 377              } else {
 378                  // not well behaved
 379                  if ($open_quote) {
 380                      if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
 381                      $value = substr($quoted_value, 1);
 382                  } else {
 383                      $value = $quoted_value;
 384                  }
 385              }
 386              if ($value === false) $value = '';
 387              return array($key => $value);
 388          }
 389  
 390          // setup loop environment
 391          $array  = array(); // return assoc array of attributes
 392          $cursor = 0; // current position in string (moves forward)
 393          $size   = strlen($string); // size of the string (stays the same)
 394  
 395          // if we have unquoted attributes, the parser expects a terminating
 396          // space, so let's guarantee that there's always a terminating space.
 397          $string .= ' ';
 398  
 399          while(true) {
 400  
 401              if ($cursor >= $size) {
 402                  break;
 403              }
 404  
 405              $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 406              // grab the key
 407  
 408              $key_begin = $cursor; //we're currently at the start of the key
 409  
 410              // scroll past all characters that are the key (not whitespace or =)
 411              $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 412  
 413              $key_end = $cursor; // now at the end of the key
 414  
 415              $key = substr($string, $key_begin, $key_end - $key_begin);
 416  
 417              if (!$key) {
 418                  if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 419                  $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
 420                  continue; // empty key
 421              }
 422  
 423              // scroll past all whitespace
 424              $cursor += strspn($string, $this->_whitespace, $cursor);
 425  
 426              if ($cursor >= $size) {
 427                  $array[$key] = $key;
 428                  break;
 429              }
 430  
 431              // if the next character is an equal sign, we've got a regular
 432              // pair, otherwise, it's a bool attribute
 433              $first_char = @$string[$cursor];
 434  
 435              if ($first_char == '=') {
 436                  // key="value"
 437  
 438                  $cursor++;
 439                  $cursor += strspn($string, $this->_whitespace, $cursor);
 440  
 441                  if ($cursor === false) {
 442                      $array[$key] = '';
 443                      break;
 444                  }
 445  
 446                  // we might be in front of a quote right now
 447  
 448                  $char = @$string[$cursor];
 449  
 450                  if ($char == '"' || $char == "'") {
 451                      // it's quoted, end bound is $char
 452                      $cursor++;
 453                      $value_begin = $cursor;
 454                      $cursor = strpos($string, $char, $cursor);
 455                      $value_end = $cursor;
 456                  } else {
 457                      // it's not quoted, end bound is whitespace
 458                      $value_begin = $cursor;
 459                      $cursor += strcspn($string, $this->_whitespace, $cursor);
 460                      $value_end = $cursor;
 461                  }
 462  
 463                  // we reached a premature end
 464                  if ($cursor === false) {
 465                      $cursor = $size;
 466                      $value_end = $cursor;
 467                  }
 468  
 469                  $value = substr($string, $value_begin, $value_end - $value_begin);
 470                  if ($value === false) $value = '';
 471                  $array[$key] = $this->parseData($value);
 472                  $cursor++;
 473  
 474              } else {
 475                  // boolattr
 476                  if ($key !== '') {
 477                      $array[$key] = $key;
 478                  } else {
 479                      // purely theoretical
 480                      if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 481                  }
 482  
 483              }
 484          }
 485          return $array;
 486      }
 487  
 488  }
 489  
 490  // vim: et sw=4 sts=4
PHP Cross Reference of vtigercrm-6.1.0

/libraries/htmlpurifier/library/HTMLPurifier/Lexer/ -> DirectLex.php (source)