[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
<?php
/**
 * Performs transformations of HTML by wrapping around libxml2 and working
 * around its countless bugs.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */
class HtmlFormatter {
	/**
	 * @var DOMDocument|null Built lazily by getDoc(); stays null until first requested
	 */
	private $doc;

	/** @var string HTML passed to the constructor */
	private $html;

	/** @var string[] Selectors registered through remove() */
	private $itemsToRemove = array();

	/** @var string[] Tag names / undelimited regexes registered through flatten() */
	private $elementsToFlatten = array();

	/** @var bool Whether img/audio/video elements are added to the removal list */
	protected $removeMedia = false;

	/**
	 * Constructor
	 *
	 * @param string $html Text to process
	 */
	public function __construct( $html ) {
		$this->html = $html;
	}

	/**
	 * Turns a chunk of HTML into a proper document
	 * @param string $html
	 * @return string
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Parses the HTML into a DOM tree on first call and returns the same
	 * document on every later call.
	 *
	 * @return DOMDocument DOM to manipulate
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

			// Workaround for bug that caused spaces before references
			// to disappear during processing:
			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
			//
			// The space is turned into a numeric entity so that libxml does not
			// drop it; fixLibXML() decodes it back when serialising.
			// Please replace with a better fix if one can be found.
			$html = str_replace( ' <', '&#32;<', $html );

			// Remember the caller's libxml error mode so it can be restored
			// afterwards instead of being unconditionally switched off.
			$previousErrorMode = libxml_use_internal_errors( true );
			$loader = libxml_disable_entity_loader();
			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );
			libxml_disable_entity_loader( $loader );
			// Drop the parse errors collected above so they do not pile up
			// when internal error handling stays enabled for the caller.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousErrorMode );
			$this->doc->encoding = 'UTF-8';
		}
		return $this->doc;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selector of content to remove. A subset of CSS selector
	 * syntax is supported:
	 *
	 *   <tag>
	 *   <tag>.class
	 *   .<class>
	 *   #<id>
	 *
	 * @param array|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element name to the list to flatten (remove tag, but not its content)
	 * Can accept undelimited regexes
	 *
	 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
	 * so should not be relied on for HTML markup security measures.
	 *
	 * @param array|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes content we've chosen to remove. The text of the removed elements can be
	 * extracted with the getText method.
	 * @return array Array of removed DOMElements
	 */
	public function filterContent() {
		wfProfileIn( __METHOD__ );
		$removals = $this->parseItemsToRemove();

		// Bail out early if nothing to do: array_filter() drops every empty
		// selector list, so an empty result means no selector of any type.
		if ( !array_filter( $removals ) ) {
			wfProfileOut( __METHOD__ );
			return array();
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = array();
		foreach ( $removals['TAG'] as $tagToRemove ) {
			$tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
			foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
				if ( $tagToRemoveNode ) {
					$domElemsToRemove[] = $tagToRemoveNode;
				}
			}
		}
		$removed = $this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = array();
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// CSS Classes
		$domElemsToRemove = array();
		$xpath = new DOMXpath( $doc );
		foreach ( $removals['CLASS'] as $classToRemove ) {
			// contains() is only a coarse pre-filter; the regex below narrows it down
			$elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
			if ( !$elements ) {
				// query() returns false for a malformed expression (for example
				// a double quote inside the class name); nothing can be matched.
				continue;
			}

			// Quote the class name so regex metacharacters in it are taken literally
			$classPattern = '/\b' . preg_quote( $classToRemove, '/' ) . '\b/';

			/** @var $element DOMElement */
			foreach ( $elements as $element ) {
				$classes = $element->getAttribute( 'class' );
				if ( preg_match( $classPattern, $classes ) && $element->parentNode ) {
					$domElemsToRemove[] = $element;
				}
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// Tags with CSS Classes
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			$parts = explode( '.', $classToRemove );

			// Matches only when the class attribute equals the given class exactly
			$elements = $xpath->query(
				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
			);
			if ( !$elements ) {
				// Malformed expression, see the note in the CSS class loop above
				continue;
			}
			$removed = array_merge( $removed, $this->removeElements( $elements ) );
		}

		wfProfileOut( __METHOD__ );
		return $removed;
	}

	/**
	 * Removes a list of elements from DOMDocument
	 * @param array|DOMNodeList $elements
	 * @return array Array of removed elements
	 */
	private function removeElements( $elements ) {
		$list = $elements;
		if ( $elements instanceof DOMNodeList ) {
			// Copy into a plain array first: removing nodes while iterating
			// a DOMNodeList upsets its iterator.
			$list = array();
			foreach ( $elements as $element ) {
				$list[] = $element;
			}
		}
		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			// Skip nodes already detached, e.g. as part of a removed ancestor
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
		return $list;
	}

	/**
	 * libxml in its usual pointlessness converts many chars to entities - this function
	 * performs a reverse conversion
	 * @param string $html
	 * @return string
	 */
	private function fixLibXML( $html ) {
		wfProfileIn( __METHOD__ );
		static $replacements;
		if ( !$replacements ) {
			// The four markup-significant entities get their ampersand escaped once
			// more, so that the HTML-ENTITIES decoding below turns them back into
			// the same entity instead of into a raw character.
			// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			$replacements = new ReplacementArray( array(
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			) );
		}
		$html = $replacements->replace( $html );
		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Performs final transformations and returns resulting HTML. Note that if you want to call this
	 * both without an element and with an element you should call it without an element first. If you
	 * specify the $element in the method it'll change the underlying dom and you won't be able to get
	 * it back.
	 *
	 * If the DOM was never built (getDoc() not called), the original HTML string
	 * is used as the starting point instead of a serialised document.
	 *
	 * @param DOMElement|string|null $element Element, or ID of the element, to get HTML from;
	 *   null to get it from the whole tree
	 * @return string Processed HTML
	 */
	public function getText( $element = null ) {
		wfProfileIn( __METHOD__ );

		if ( $this->doc ) {
			wfProfileIn( __METHOD__ . '-dom' );
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				// Empty <body> and make the requested element its only child
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				$nodesArray = array();
				foreach ( $body->childNodes as $node ) {
					$nodesArray[] = $node;
				}
				foreach ( $nodesArray as $nodeArray ) {
					$body->removeChild( $nodeArray );
				}
				$body->appendChild( $element );
			}
			$html = $this->doc->saveHTML();
			wfProfileOut( __METHOD__ . '-dom' );

			wfProfileIn( __METHOD__ . '-fixes' );
			$html = $this->fixLibXML( $html );
			if ( wfIsWindows() ) {
				// Cleanup for CRLF misprocessing of unknown origin on Windows.
				//
				// If this error continues in the future, please track it down in the
				// XML code paths if possible and fix there.
				$html = str_replace( '&#13;', '', $html );
			}
			wfProfileOut( __METHOD__ . '-fixes' );
		} else {
			$html = $this->html;
		}
		// Remove stuff added by wrapHTML()
		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
		$html = $this->onHtmlReady( $html );

		wfProfileIn( __METHOD__ . '-flatten' );
		if ( $this->elementsToFlatten ) {
			// Strip opening and closing tags of the listed elements, keeping their content
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}
		wfProfileOut( __METHOD__ . '-flatten' );

		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Helper function for parseItemsToRemove(). This function extracts the selector type
	 * and the raw name of a selector from a CSS-style selector string and assigns those
	 * values to parameters passed by reference. For example, if given '#toc' as the
	 * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
	 * @param string $selector CSS selector to parse
	 * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
	 * @param string $rawName The raw name of the selector
	 * @return bool Whether the selector was successfully recognised
	 * @throws MWException If the selector matches none of the supported forms
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		$dotPosition = strpos( $selector, '.' );

		if ( $dotPosition === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( $dotPosition !== false ) {
			// A dot somewhere after the first character: <tag>.class
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS-style selectors into an internal representation suitable for
	 * processing by filterContent()
	 * @return array Lists of raw names keyed by 'ID', 'TAG', 'CLASS' and 'TAG_CLASS'
	 */
	protected function parseItemsToRemove() {
		wfProfileIn( __METHOD__ );
		$removals = array(
			'ID' => array(),
			'TAG' => array(),
			'CLASS' => array(),
			'TAG_CLASS' => array(),
		);

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		wfProfileOut( __METHOD__ );
		return $removals;
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |