PHPXRef 0.7.1 : vtigercrm-6.1.0 : /libraries/htmlpurifier/library/HTMLPurifier/Lexer.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  /**
   4   * Forgivingly lexes HTML (SGML-style) markup into tokens.
   5   *
   6   * A lexer parses a string of SGML-style markup and converts it into
   7   * corresponding tokens.  It doesn't check for well-formedness, although its
   8   * internal mechanism may make this automatic (such as the case of
   9   * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
  10   * from.
  11   *
  12   * A lexer is HTML-oriented: it might work with XML, but it's not
  13   * recommended, as we adhere to a subset of the specification for optimization
  14   * reasons. This might change in the future. Also, most tokenizers are not
  15   * expected to handle DTDs or PIs.
  16   *
  17   * This class should not be directly instantiated, but you may use create() to
  18   * retrieve a default copy of the lexer.  Being a supertype, this class
  19   * does not actually define any implementation, but offers commonly used
  20   * convenience functions for subclasses.
  21   *
  22   * @note The unit tests will instantiate this class for testing purposes, as
  23   *       many of the utility functions require a class to be instantiated.
  24   *       This means that, even though this class is not runnable, it will
  25   *       not be declared abstract.
  26   *
  27   * @par
  28   *
  29   * @note
  30   * We use tokens rather than create a DOM representation because DOM would:
  31   *
  32   * @par
  33   *  -# Require more processing and memory to create,
  34   *  -# Not be streamable, and
  35   *  -# Include the entire document structure (html and body not needed).
  36   *
  37   * @par
  38   * However, DOM is helpful in that it makes it easy to move around nodes
  39   * without a lot of lookaheads to see when a tag is closed. This is a
  40   * limitation of the token system and some workarounds would be nice.
  41   */
  42  class HTMLPurifier_Lexer
  43  {
  44  
  45      /**
  46       * Whether or not this lexer implements line-number/column-number tracking.
  47       * If it does, set to true.
  48       */
  49      public $tracksLineNumbers = false;
  50  
  51      // -- STATIC ----------------------------------------------------------
  52  
  53      /**
  54       * Retrieves or sets the default Lexer as a Prototype Factory.
  55       *
  56       * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
  57       * a few exceptions involving special features that only DirectLex
  58       * implements.
  59       *
  60       * @note The behavior of this class has changed, rather than accepting
  61       *       a prototype object, it now accepts a configuration object.
  62       *       To specify your own prototype, set %Core.LexerImpl to it.
  63       *       This change in behavior de-singletonizes the lexer object.
  64       *
  65       * @param $config Instance of HTMLPurifier_Config
  66       * @return Concrete lexer.
  67       */
  68      public static function create($config) {
  69  
  70          if (!($config instanceof HTMLPurifier_Config)) {
  71              $lexer = $config;
  72              trigger_error("Passing a prototype to
  73                HTMLPurifier_Lexer::create() is deprecated, please instead
  74                use %Core.LexerImpl", E_USER_WARNING);
  75          } else {
  76              $lexer = $config->get('Core', 'LexerImpl');
  77          }
  78  
  79          $needs_tracking =
  80              $config->get('Core', 'MaintainLineNumbers') ||
  81              $config->get('Core', 'CollectErrors');
  82  
  83          $inst = null;
  84          if (is_object($lexer)) {
  85              $inst = $lexer;
  86          } else {
  87  
  88              if (is_null($lexer)) { do {
  89                  // auto-detection algorithm
  90  
  91                  if ($needs_tracking) {
  92                      $lexer = 'DirectLex';
  93                      break;
  94                  }
  95  
  96                  if (
  97                      class_exists('DOMDocument') &&
  98                      method_exists('DOMDocument', 'loadHTML') &&
  99                      !extension_loaded('domxml')
 100                  ) {
 101                      // check for DOM support, because while it's part of the
 102                      // core, it can be disabled compile time. Also, the PECL
 103                      // domxml extension overrides the default DOM, and is evil
 104                      // and nasty and we shan't bother to support it
 105                      $lexer = 'DOMLex';
 106                  } else {
 107                      $lexer = 'DirectLex';
 108                  }
 109  
 110              } while(0); } // do..while so we can break
 111  
 112              // instantiate recognized string names
 113              switch ($lexer) {
 114                  case 'DOMLex':
 115                      $inst = new HTMLPurifier_Lexer_DOMLex();
 116                      break;
 117                  case 'DirectLex':
 118                      $inst = new HTMLPurifier_Lexer_DirectLex();
 119                      break;
 120                  case 'PH5P':
 121                      $inst = new HTMLPurifier_Lexer_PH5P();
 122                      break;
 123                  default:
 124                      throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
 125              }
 126          }
 127  
 128          if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');
 129  
 130          // once PHP DOM implements native line numbers, or we
 131          // hack out something using XSLT, remove this stipulation
 132          if ($needs_tracking && !$inst->tracksLineNumbers) {
 133              throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
 134          }
 135  
 136          return $inst;
 137  
 138      }
 139  
 140      // -- CONVENIENCE MEMBERS ---------------------------------------------
 141  
 142      public function __construct() {
 143          $this->_entity_parser = new HTMLPurifier_EntityParser();
 144      }
 145  
 146      /**
 147       * Most common entity to raw value conversion table for special entities.
 148       */
 149      protected $_special_entity2str =
 150              array(
 151                      '&quot;' => '"',
 152                      '&amp;'  => '&',
 153                      '&lt;'   => '<',
 154                      '&gt;'   => '>',
 155                      '&#39;'  => "'",
 156                      '&#039;' => "'",
 157                      '&#x27;' => "'"
 158              );
 159  
 160      /**
 161       * Parses special entities into the proper characters.
 162       *
 163       * This string will translate escaped versions of the special characters
 164       * into the correct ones.
 165       *
 166       * @warning
 167       * You should be able to treat the output of this function as
 168       * completely parsed, but that's only because all other entities should
 169       * have been handled previously in substituteNonSpecialEntities()
 170       *
 171       * @param $string String character data to be parsed.
 172       * @returns Parsed character data.
 173       */
 174      public function parseData($string) {
 175  
 176          // following functions require at least one character
 177          if ($string === '') return '';
 178  
 179          // subtracts amps that cannot possibly be escaped
 180          $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
 181              ($string[strlen($string)-1] === '&' ? 1 : 0);
 182  
 183          if (!$num_amp) return $string; // abort if no entities
 184          $num_esc_amp = substr_count($string, '&amp;');
 185          $string = strtr($string, $this->_special_entity2str);
 186  
 187          // code duplication for sake of optimization, see above
 188          $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
 189              ($string[strlen($string)-1] === '&' ? 1 : 0);
 190  
 191          if ($num_amp_2 <= $num_esc_amp) return $string;
 192  
 193          // hmm... now we have some uncommon entities. Use the callback.
 194          $string = $this->_entity_parser->substituteSpecialEntities($string);
 195          return $string;
 196      }
 197  
 198      /**
 199       * Lexes an HTML string into tokens.
 200       *
 201       * @param $string String HTML.
 202       * @return HTMLPurifier_Token array representation of HTML.
 203       */
 204      public function tokenizeHTML($string, $config, $context) {
 205          trigger_error('Call to abstract class', E_USER_ERROR);
 206      }
 207  
 208      /**
 209       * Translates CDATA sections into regular sections (through escaping).
 210       *
 211       * @param $string HTML string to process.
 212       * @returns HTML with CDATA sections escaped.
 213       */
 214      protected static function escapeCDATA($string) {
 215          return preg_replace_callback(
 216              '/<!\[CDATA\[(.+?)\]\]>/s',
 217              array('HTMLPurifier_Lexer', 'CDATACallback'),
 218              $string
 219          );
 220      }
 221  
 222      /**
 223       * Special CDATA case that is especially convoluted for <script>
 224       */
 225      protected static function escapeCommentedCDATA($string) {
 226          return preg_replace_callback(
 227              '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
 228              array('HTMLPurifier_Lexer', 'CDATACallback'),
 229              $string
 230          );
 231      }
 232  
 233      /**
 234       * Callback function for escapeCDATA() that does the work.
 235       *
 236       * @warning Though this is public in order to let the callback happen,
 237       *          calling it directly is not recommended.
 238       * @params $matches PCRE matches array, with index 0 the entire match
 239       *                  and 1 the inside of the CDATA section.
 240       * @returns Escaped internals of the CDATA section.
 241       */
 242      protected static function CDATACallback($matches) {
 243          // not exactly sure why the character set is needed, but whatever
 244          return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
 245      }
 246  
 247      /**
 248       * Takes a piece of HTML and normalizes it by converting entities, fixing
 249       * encoding, extracting bits, and other good stuff.
 250       * @todo Consider making protected
 251       */
 252      public function normalize($html, $config, $context) {
 253  
 254          // normalize newlines to \n
 255          $html = str_replace("\r\n", "\n", $html);
 256          $html = str_replace("\r", "\n", $html);
 257  
 258          if ($config->get('HTML', 'Trusted')) {
 259              // escape convoluted CDATA
 260              $html = $this->escapeCommentedCDATA($html);
 261          }
 262  
 263          // escape CDATA
 264          $html = $this->escapeCDATA($html);
 265  
 266          // extract body from document if applicable
 267          if ($config->get('Core', 'ConvertDocumentToFragment')) {
 268              $html = $this->extractBody($html);
 269          }
 270  
 271          // expand entities that aren't the big five
 272          $html = $this->_entity_parser->substituteNonSpecialEntities($html);
 273  
 274          // clean into wellformed UTF-8 string for an SGML context: this has
 275          // to be done after entity expansion because the entities sometimes
 276          // represent non-SGML characters (horror, horror!)
 277          $html = HTMLPurifier_Encoder::cleanUTF8($html);
 278  
 279          return $html;
 280      }
 281  
 282      /**
 283       * Takes a string of HTML (fragment or document) and returns the content
 284       * @todo Consider making protected
 285       */
 286      public function extractBody($html) {
 287          $matches = array();
 288          $result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);
 289          if ($result) {
 290              return $matches[1];
 291          } else {
 292              return $html;
 293          }
 294      }
 295  
 296  }
 297  
 298  // vim: et sw=4 sts=4
PHP Cross Reference of vtigercrm-6.1.0

/libraries/htmlpurifier/library/HTMLPurifier/ -> Lexer.php (source)