MediaWiki
REL1_24
|
<?php

/**
 * Filters and simplifies an HTML document by round-tripping it through a
 * libxml-backed DOMDocument.
 *
 * Callers queue selectors with remove() and tag patterns with flatten(),
 * apply the removals with filterContent(), and read the result with getText().
 */
class HtmlFormatter {
	/**
	 * @var DOMDocument|null Lazily created by getDoc()
	 */
	private $doc;

	/** @var string HTML passed to the constructor */
	private $html;
	/** @var string[] Selectors queued by remove() */
	private $itemsToRemove = array();
	/** @var string[] Tag-name regex fragments queued by flatten() */
	private $elementsToFlatten = array();
	/** @var bool Whether filterContent() also removes img/audio/video tags */
	protected $removeMedia = false;

	/**
	 * @param string $html Text to process. getText() strips everything up to
	 *   and including <body>, so the input is presumably expected to be a full
	 *   document such as the one produced by wrapHTML().
	 */
	public function __construct( $html ) {
		$this->html = $html;
	}

	/**
	 * Wraps an HTML fragment in a minimal HTML5 document skeleton.
	 *
	 * @param string $html Fragment to wrap
	 * @return string Full document with $html as the body content
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Hook for subclasses: called by getText() after the document wrapper has
	 * been stripped and before tags are flattened. The base implementation
	 * returns its input unchanged.
	 *
	 * @param string $html
	 * @return string
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Returns the DOM for the HTML given to the constructor, parsing it on the
	 * first call and returning the same object afterwards.
	 *
	 * @return DOMDocument
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			// Non-ASCII characters are turned into character references so that
			// loadHTML() does not have to guess the input encoding.
			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

			// Workaround for bug that caused spaces before references
			// to disappear during processing:
			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
			//
			// A space directly in front of a tag is written as a numeric
			// character reference so that the parser cannot discard it.
			//
			// Please replace with a better fix if one can be found.
			$html = str_replace( ' <', '&#32;<', $html );

			// Collect parse warnings internally instead of emitting PHP warnings,
			// and keep the entity loader in whatever state it was in before.
			libxml_use_internal_errors( true );
			$loader = libxml_disable_entity_loader();
			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );
			libxml_disable_entity_loader( $loader );
			libxml_use_internal_errors( false );
			$this->doc->encoding = 'UTF-8';
		}
		return $this->doc;
	}

	/**
	 * Sets whether img, audio and video tags are added to the removal list
	 * used by filterContent().
	 *
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Queues one or more selectors for removal by filterContent().
	 *
	 * Forms understood by parseSelector():
	 *   - "tag"        every element with that tag name
	 *   - "tag.class"  elements of that tag whose class attribute equals "class"
	 *   - ".class"     any element carrying that class
	 *   - "#id"        the element with that ID
	 *
	 * @param array|string $selectors Selector or list of selectors
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Queues tag names whose opening and closing tags getText() strips while
	 * keeping their content.
	 *
	 * Each entry is inserted verbatim into a regular expression, so regex
	 * fragments are accepted (flattenAllTags() relies on this).
	 *
	 * @param array|string $elements Tag name (or regex fragment), or a list of them
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Queues a pattern matching any tag name, so that getText() strips all tags.
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes from the DOM everything queued through remove() and
	 * setRemoveMedia().
	 *
	 * The document is only parsed when there is at least one thing to remove.
	 *
	 * @return array Elements that were selected for removal (empty when
	 *   nothing was queued)
	 * @throws MWException If a queued selector is not recognized
	 */
	public function filterContent() {
		wfProfileIn( __METHOD__ );
		$removals = $this->parseItemsToRemove();

		// Bail out early if nothing to do
		if ( array_reduce( $removals,
			function ( $carry, $item ) {
				return $carry && !$item;
			},
			true
		) ) {
			wfProfileOut( __METHOD__ );
			return array();
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = array();
		foreach ( $removals['TAG'] as $tagToRemove ) {
			$tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
			foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
				if ( $tagToRemoveNode ) {
					$domElemsToRemove[] = $tagToRemoveNode;
				}
			}
		}
		$removed = $this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = array();
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// CSS Classes
		$domElemsToRemove = array();
		$xpath = new DOMXpath( $doc );
		foreach ( $removals['CLASS'] as $classToRemove ) {
			// contains() is only a substring pre-filter; the regex below
			// rejects hits inside longer class names.
			$elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
			// Quote the name so that regex metacharacters in it are matched
			// literally instead of altering (or breaking) the pattern.
			$classPattern = '/\b' . preg_quote( $classToRemove, '/' ) . '\b/';

			/** @var $element DOMElement */
			foreach ( $elements as $element ) {
				$classes = $element->getAttribute( 'class' );
				if ( preg_match( $classPattern, $classes ) && $element->parentNode ) {
					$domElemsToRemove[] = $element;
				}
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// Tags with CSS Classes
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			$parts = explode( '.', $classToRemove );

			// Note: this matches only when the whole class attribute equals
			// the requested class.
			$elements = $xpath->query(
				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
			);
			$removed = array_merge( $removed, $this->removeElements( $elements ) );
		}

		wfProfileOut( __METHOD__ );
		return $removed;
	}

	/**
	 * Detaches the given elements from their parents.
	 *
	 * @param array|DOMNodeList $elements
	 * @return array The elements as a plain array, including any that had no
	 *   parent and were therefore left alone
	 */
	private function removeElements( $elements ) {
		$list = $elements;
		if ( $elements instanceof DOMNodeList ) {
			// Copy into an array first so that removing nodes does not
			// disturb iteration over the node list.
			$list = array();
			foreach ( $elements as $element ) {
				$list[] = $element;
			}
		}
		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
		return $list;
	}

	/**
	 * Converts the character references in libxml-serialized HTML back to
	 * UTF-8 while keeping markup-significant entities escaped.
	 *
	 * &quot;, &amp;, &lt; and &gt; are double-escaped first, so the
	 * HTML-ENTITIES decoding step turns them back into the original entity
	 * rather than into a raw character.
	 *
	 * @param string $html HTML as produced by DOMDocument::saveHTML()
	 * @return string
	 */
	private function fixLibXML( $html ) {
		wfProfileIn( __METHOD__ );
		static $replacements;
		if ( !$replacements ) {
			// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			$replacements = new ReplacementArray( array(
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			) );
		}
		$html = $replacements->replace( $html );
		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Returns the processed HTML.
	 *
	 * If the document has been parsed (getDoc() or filterContent() did work),
	 * the DOM is serialized; otherwise the original HTML is used. In both cases
	 * HTML comments and the wrapper around the body content are stripped,
	 * onHtmlReady() is applied, and tags queued with flatten() are removed.
	 *
	 * Passing $element replaces the body content of the shared document with
	 * that element, which affects later calls as well. It is ignored when the
	 * document has not been parsed.
	 *
	 * @param DOMElement|string|null $element Element to restrict the output to,
	 *   or its ID; null for the whole body
	 * @return string
	 */
	public function getText( $element = null ) {
		wfProfileIn( __METHOD__ );

		if ( $this->doc ) {
			wfProfileIn( __METHOD__ . '-dom' );
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				// Snapshot the children before removing them; childNodes is live.
				$nodesArray = array();
				foreach ( $body->childNodes as $node ) {
					$nodesArray[] = $node;
				}
				foreach ( $nodesArray as $nodeArray ) {
					$body->removeChild( $nodeArray );
				}
				$body->appendChild( $element );
			}
			$html = $this->doc->saveHTML();
			wfProfileOut( __METHOD__ . '-dom' );

			wfProfileIn( __METHOD__ . '-fixes' );
			$html = $this->fixLibXML( $html );
			if ( wfIsWindows() ) {
				// Cleanup for CRLF misprocessing of unknown origin on Windows.
				//
				// If this error continues in the future, please track it down in the
				// XML code paths if possible and fix there.
				//
				// NOTE(review): strips the carriage-return character reference;
				// confirm this is still the form that reaches this point.
				$html = str_replace( '&#13;', '', $html );
			}
			wfProfileOut( __METHOD__ . '-fixes' );
		} else {
			$html = $this->html;
		}
		// Remove stuff added by wrapHTML()
		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
		$html = $this->onHtmlReady( $html );

		wfProfileIn( __METHOD__ . '-flatten' );
		if ( $this->elementsToFlatten ) {
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}
		wfProfileOut( __METHOD__ . '-flatten' );

		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Classifies a selector passed to remove().
	 *
	 * @param string $selector Selector to parse
	 * @param string $type Set to 'CLASS', 'ID', 'TAG_CLASS' or 'TAG'
	 * @param string $rawName Set to the selector without its leading '.' or
	 *   '#'; for 'TAG_CLASS' and 'TAG' the selector itself
	 * @return bool Always true; unrecognized selectors throw instead
	 * @throws MWException If the selector contains '[' or ']' and matches no
	 *   other form
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		if ( strpos( $selector, '.' ) === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) {
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Groups the queued selectors by type and, when media removal is enabled,
	 * adds the img, audio and video tags.
	 *
	 * @return array Map with the keys 'ID', 'TAG', 'CLASS' and 'TAG_CLASS',
	 *   each holding a list of raw names
	 * @throws MWException If a queued selector is not recognized
	 */
	protected function parseItemsToRemove() {
		wfProfileIn( __METHOD__ );
		$removals = array(
			'ID' => array(),
			'TAG' => array(),
			'CLASS' => array(),
			'TAG_CLASS' => array(),
		);

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		wfProfileOut( __METHOD__ );
		return $removals;
	}
}