[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/includes/ -> HtmlFormatter.php (source)

   1  <?php
   2  /**
   3   * Performs transformations of HTML by wrapping around libxml2 and working
   4   * around its countless bugs.
   5   *
   6   * This program is free software; you can redistribute it and/or modify
   7   * it under the terms of the GNU General Public License as published by
   8   * the Free Software Foundation; either version 2 of the License, or
   9   * (at your option) any later version.
  10   *
  11   * This program is distributed in the hope that it will be useful,
  12   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14   * GNU General Public License for more details.
  15   *
  16   * You should have received a copy of the GNU General Public License along
  17   * with this program; if not, write to the Free Software Foundation, Inc.,
  18   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19   * http://www.gnu.org/copyleft/gpl.html
  20   *
  21   * @file
  22   */
  23  class HtmlFormatter {
  24      /**
  25       * @var DOMDocument
  26       */
  27      private $doc;
  28  
  29      private $html;
  30      private $itemsToRemove = array();
  31      private $elementsToFlatten = array();
  32      protected $removeMedia = false;
  33  
  34      /**
  35       * Constructor
  36       *
  37       * @param string $html Text to process
  38       */
  39  	public function __construct( $html ) {
  40          $this->html = $html;
  41      }
  42  
  43      /**
  44       * Turns a chunk of HTML into a proper document
  45       * @param string $html
  46       * @return string
  47       */
  48  	public static function wrapHTML( $html ) {
  49          return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
  50      }
  51  
  52      /**
  53       * Override this in descendant class to modify HTML after it has been converted from DOM tree
  54       * @param string $html HTML to process
  55       * @return string Processed HTML
  56       */
  57  	protected function onHtmlReady( $html ) {
  58          return $html;
  59      }
  60  
  61      /**
  62       * @return DOMDocument DOM to manipulate
  63       */
  64  	public function getDoc() {
  65          if ( !$this->doc ) {
  66              $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
  67  
  68              // Workaround for bug that caused spaces before references
  69              // to disappear during processing:
  70              // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
  71              //
  72              // Please replace with a better fix if one can be found.
  73              $html = str_replace( ' <', '&#32;<', $html );
  74  
  75              libxml_use_internal_errors( true );
  76              $loader = libxml_disable_entity_loader();
  77              $this->doc = new DOMDocument();
  78              $this->doc->strictErrorChecking = false;
  79              $this->doc->loadHTML( $html );
  80              libxml_disable_entity_loader( $loader );
  81              libxml_use_internal_errors( false );
  82              $this->doc->encoding = 'UTF-8';
  83          }
  84          return $this->doc;
  85      }
  86  
  87      /**
  88       * Sets whether images/videos/sounds should be removed from output
  89       * @param bool $flag
  90       */
  91  	public function setRemoveMedia( $flag = true ) {
  92          $this->removeMedia = $flag;
  93      }
  94  
  95      /**
  96       * Adds one or more selector of content to remove. A subset of CSS selector
  97       * syntax is supported:
  98       *
  99       *   <tag>
 100       *   <tag>.class
 101       *   .<class>
 102       *   #<id>
 103       *
 104       * @param array|string $selectors Selector(s) of stuff to remove
 105       */
 106  	public function remove( $selectors ) {
 107          $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
 108      }
 109  
 110      /**
 111       * Adds one or more element name to the list to flatten (remove tag, but not its content)
 112       * Can accept undelimited regexes
 113       *
 114       * Note this interface may fail in surprising unexpected ways due to usage of regexes,
 115       * so should not be relied on for HTML markup security measures.
 116       *
 117       * @param array|string $elements Name(s) of tag(s) to flatten
 118       */
 119  	public function flatten( $elements ) {
 120          $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
 121      }
 122  
 123      /**
 124       * Instructs the formatter to flatten all tags
 125       */
 126  	public function flattenAllTags() {
 127          $this->flatten( '[?!]?[a-z0-9]+' );
 128      }
 129  
 130      /**
 131       * Removes content we've chosen to remove.  The text of the removed elements can be
 132       * extracted with the getText method.
 133       * @return array Array of removed DOMElements
 134       */
 135  	public function filterContent() {
 136          wfProfileIn( __METHOD__ );
 137          $removals = $this->parseItemsToRemove();
 138  
 139          // Bail out early if nothing to do
 140          if ( array_reduce( $removals,
 141              function ( $carry, $item ) {
 142                  return $carry && !$item;
 143              },
 144              true
 145          ) ) {
 146              wfProfileOut( __METHOD__ );
 147              return array();
 148          }
 149  
 150          $doc = $this->getDoc();
 151  
 152          // Remove tags
 153  
 154          // You can't remove DOMNodes from a DOMNodeList as you're iterating
 155          // over them in a foreach loop. It will seemingly leave the internal
 156          // iterator on the foreach out of wack and results will be quite
 157          // strange. Though, making a queue of items to remove seems to work.
 158          $domElemsToRemove = array();
 159          foreach ( $removals['TAG'] as $tagToRemove ) {
 160              $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
 161              foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
 162                  if ( $tagToRemoveNode ) {
 163                      $domElemsToRemove[] = $tagToRemoveNode;
 164                  }
 165              }
 166          }
 167          $removed = $this->removeElements( $domElemsToRemove );
 168  
 169          // Elements with named IDs
 170          $domElemsToRemove = array();
 171          foreach ( $removals['ID'] as $itemToRemove ) {
 172              $itemToRemoveNode = $doc->getElementById( $itemToRemove );
 173              if ( $itemToRemoveNode ) {
 174                  $domElemsToRemove[] = $itemToRemoveNode;
 175              }
 176          }
 177          $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 178  
 179          // CSS Classes
 180          $domElemsToRemove = array();
 181          $xpath = new DOMXpath( $doc );
 182          foreach ( $removals['CLASS'] as $classToRemove ) {
 183              $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
 184  
 185              /** @var $element DOMElement */
 186              foreach ( $elements as $element ) {
 187                  $classes = $element->getAttribute( 'class' );
 188                  if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) {
 189                      $domElemsToRemove[] = $element;
 190                  }
 191              }
 192          }
 193          $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 194  
 195          // Tags with CSS Classes
 196          foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
 197              $parts = explode( '.', $classToRemove );
 198  
 199              $elements = $xpath->query(
 200                  '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
 201              );
 202              $removed = array_merge( $removed, $this->removeElements( $elements ) );
 203          }
 204  
 205          wfProfileOut( __METHOD__ );
 206          return $removed;
 207      }
 208  
 209      /**
 210       * Removes a list of elelments from DOMDocument
 211       * @param array|DOMNodeList $elements
 212       * @return array Array of removed elements
 213       */
 214  	private function removeElements( $elements ) {
 215          $list = $elements;
 216          if ( $elements instanceof DOMNodeList ) {
 217              $list = array();
 218              foreach ( $elements as $element ) {
 219                  $list[] = $element;
 220              }
 221          }
 222          /** @var $element DOMElement */
 223          foreach ( $list as $element ) {
 224              if ( $element->parentNode ) {
 225                  $element->parentNode->removeChild( $element );
 226              }
 227          }
 228          return $list;
 229      }
 230  
 231      /**
 232       * libxml in its usual pointlessness converts many chars to entities - this function
 233       * perfoms a reverse conversion
 234       * @param string $html
 235       * @return string
 236       */
 237  	private function fixLibXML( $html ) {
 238          wfProfileIn( __METHOD__ );
 239          static $replacements;
 240          if ( !$replacements ) {
 241              // We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
 242              // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
 243              $replacements = new ReplacementArray( array(
 244                  '&quot;' => '&amp;quot;',
 245                  '&amp;' => '&amp;amp;',
 246                  '&lt;' => '&amp;lt;',
 247                  '&gt;' => '&amp;gt;',
 248              ) );
 249          }
 250          $html = $replacements->replace( $html );
 251          $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
 252          wfProfileOut( __METHOD__ );
 253          return $html;
 254      }
 255  
 256      /**
 257       * Performs final transformations and returns resulting HTML.  Note that if you want to call this
 258       * both without an element and with an element you should call it without an element first.  If you
 259       * specify the $element in the method it'll change the underlying dom and you won't be able to get
 260       * it back.
 261       *
 262       * @param DOMElement|string|null $element ID of element to get HTML from or
 263       *   false to get it from the whole tree
 264       * @return string Processed HTML
 265       */
 266  	public function getText( $element = null ) {
 267          wfProfileIn( __METHOD__ );
 268  
 269          if ( $this->doc ) {
 270              wfProfileIn( __METHOD__ . '-dom' );
 271              if ( $element !== null && !( $element instanceof DOMElement ) ) {
 272                  $element = $this->doc->getElementById( $element );
 273              }
 274              if ( $element ) {
 275                  $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
 276                  $nodesArray = array();
 277                  foreach ( $body->childNodes as $node ) {
 278                      $nodesArray[] = $node;
 279                  }
 280                  foreach ( $nodesArray as $nodeArray ) {
 281                      $body->removeChild( $nodeArray );
 282                  }
 283                  $body->appendChild( $element );
 284              }
 285              $html = $this->doc->saveHTML();
 286              wfProfileOut( __METHOD__ . '-dom' );
 287  
 288              wfProfileIn( __METHOD__ . '-fixes' );
 289              $html = $this->fixLibXml( $html );
 290              if ( wfIsWindows() ) {
 291                  // Cleanup for CRLF misprocessing of unknown origin on Windows.
 292                  //
 293                  // If this error continues in the future, please track it down in the
 294                  // XML code paths if possible and fix there.
 295                  $html = str_replace( '&#13;', '', $html );
 296              }
 297              wfProfileOut( __METHOD__ . '-fixes' );
 298          } else {
 299              $html = $this->html;
 300          }
 301          // Remove stuff added by wrapHTML()
 302          $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
 303          $html = $this->onHtmlReady( $html );
 304  
 305          wfProfileIn( __METHOD__ . '-flatten' );
 306          if ( $this->elementsToFlatten ) {
 307              $elements = implode( '|', $this->elementsToFlatten );
 308              $html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
 309          }
 310          wfProfileOut( __METHOD__ . '-flatten' );
 311  
 312          wfProfileOut( __METHOD__ );
 313          return $html;
 314      }
 315  
 316      /**
 317       * Helper function for parseItemsToRemove(). This function extracts the selector type
 318       * and the raw name of a selector from a CSS-style selector string and assigns those
 319       * values to parameters passed by reference. For example, if given '#toc' as the
 320       * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
 321       * @param string $selector CSS selector to parse
 322       * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
 323       * @param string $rawName The raw name of the selector
 324       * @return bool Whether the selector was successfully recognised
 325       */
 326  	protected function parseSelector( $selector, &$type, &$rawName ) {
 327          if ( strpos( $selector, '.' ) === 0 ) {
 328              $type = 'CLASS';
 329              $rawName = substr( $selector, 1 );
 330          } elseif ( strpos( $selector, '#' ) === 0 ) {
 331              $type = 'ID';
 332              $rawName = substr( $selector, 1 );
 333          } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) {
 334              $type = 'TAG_CLASS';
 335              $rawName = $selector;
 336          } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
 337              $type = 'TAG';
 338              $rawName = $selector;
 339          } else {
 340              throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
 341          }
 342  
 343          return true;
 344      }
 345  
 346      /**
 347       * Transforms CSS-style selectors into an internal representation suitable for
 348       * processing by filterContent()
 349       * @return array
 350       */
 351  	protected function parseItemsToRemove() {
 352          wfProfileIn( __METHOD__ );
 353          $removals = array(
 354              'ID' => array(),
 355              'TAG' => array(),
 356              'CLASS' => array(),
 357              'TAG_CLASS' => array(),
 358          );
 359  
 360          foreach ( $this->itemsToRemove as $itemToRemove ) {
 361              $type = '';
 362              $rawName = '';
 363              if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
 364                  $removals[$type][] = $rawName;
 365              }
 366          }
 367  
 368          if ( $this->removeMedia ) {
 369              $removals['TAG'][] = 'img';
 370              $removals['TAG'][] = 'audio';
 371              $removals['TAG'][] = 'video';
 372          }
 373  
 374          wfProfileOut( __METHOD__ );
 375          return $removals;
 376      }
 377  }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1