[ Index ]

PHP Cross Reference of vtigercrm-6.1.0

title

Body

[close]

/libraries/htmlpurifier/library/HTMLPurifier/Lexer/ -> PEARSax3.php (source)

   1  <?php
   2  
   3  /**
   4   * Proof-of-concept lexer that uses the PEAR package XML_HTMLSax3 to parse HTML.
   5   *
   6   * PEAR, not surprisingly, also has a SAX parser for HTML.  I don't know
   7   * very much about implementation, but it's fairly well written.  However, that
   8   * abstraction comes at a price: performance. You need to have it installed,
   9   * and if the API changes, it might break our adapter. Not sure whether or not
  10   * it's UTF-8 aware, but it has some entity parsing trouble (in all areas,
  11   * text and attributes).
  12   *
  13   * Quite personally, I don't recommend using the PEAR class, and the defaults
  14   * don't use it. The unit tests do perform the tests on the SAX parser too, but
  15   * whatever it does for poorly formed HTML is up to it.
  16   *
  17   * @todo Generalize so that XML_HTMLSax is also supported.
  18   *
  19   * @warning Entity-resolution inside attributes is broken.
  20   */
  21  
  22  class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
  23  {
  24  
  25      /**
  26       * Internal accumulator array for SAX parsers.
  27       */
  28      protected $tokens = array();
  29  
  30      public function tokenizeHTML($string, $config, $context) {
  31  
  32          $this->tokens = array();
  33  
  34          $string = $this->normalize($string, $config, $context);
  35  
  36          $parser = new XML_HTMLSax3();
  37          $parser->set_object($this);
  38          $parser->set_element_handler('openHandler','closeHandler');
  39          $parser->set_data_handler('dataHandler');
  40          $parser->set_escape_handler('escapeHandler');
  41  
  42          // doesn't seem to work correctly for attributes
  43          $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
  44  
  45          $parser->parse($string);
  46  
  47          return $this->tokens;
  48  
  49      }
  50  
  51      /**
  52       * Open tag event handler, interface is defined by PEAR package.
  53       */
  54      public function openHandler(&$parser, $name, $attrs, $closed) {
  55          // entities are not resolved in attrs
  56          foreach ($attrs as $key => $attr) {
  57              $attrs[$key] = $this->parseData($attr);
  58          }
  59          if ($closed) {
  60              $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
  61          } else {
  62              $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
  63          }
  64          return true;
  65      }
  66  
  67      /**
  68       * Close tag event handler, interface is defined by PEAR package.
  69       */
  70      public function closeHandler(&$parser, $name) {
  71          // HTMLSax3 seems to always send empty tags an extra close tag
  72          // check and ignore if you see it:
  73          // [TESTME] to make sure it doesn't overreach
  74          if ($this->tokens[count($this->tokens)-1] instanceof HTMLPurifier_Token_Empty) {
  75              return true;
  76          }
  77          $this->tokens[] = new HTMLPurifier_Token_End($name);
  78          return true;
  79      }
  80  
  81      /**
  82       * Data event handler, interface is defined by PEAR package.
  83       */
  84      public function dataHandler(&$parser, $data) {
  85          $this->tokens[] = new HTMLPurifier_Token_Text($data);
  86          return true;
  87      }
  88  
  89      /**
  90       * Escaped text handler, interface is defined by PEAR package.
  91       */
  92      public function escapeHandler(&$parser, $data) {
  93          if (strpos($data, '--') === 0) {
  94              $this->tokens[] = new HTMLPurifier_Token_Comment($data);
  95          }
  96          // CDATA is handled elsewhere, but if it was handled here:
  97          //if (strpos($data, '[CDATA[') === 0) {
  98          //    $this->tokens[] = new HTMLPurifier_Token_Text(
  99          //        substr($data, 7, strlen($data) - 9) );
 100          //}
 101          return true;
 102      }
 103  
 104  }
 105  
 106  // vim: et sw=4 sts=4


Generated: Fri Nov 28 20:08:37 2014 Cross-referenced by PHPXref 0.7.1