MediaWiki  REL1_24
HtmlFormatter.php
Go to the documentation of this file.
00001 <?php
/**
 * Loads an HTML string into a DOMDocument, removes elements matched by a
 * small set of CSS-like selectors, and serialises the result back to HTML,
 * optionally stripping ("flattening") selected tags while keeping their
 * contents.
 */
class HtmlFormatter {
    /**
     * @var DOMDocument|null Parsed document, created lazily by getDoc()
     */
    private $doc;

    /**
     * @var string HTML as passed to the constructor
     */
    private $html;

    /**
     * @var string[] Selectors queued by remove()
     */
    private $itemsToRemove = array();

    /**
     * @var string[] Tag-name regex fragments queued by flatten()
     */
    private $elementsToFlatten = array();

    /**
     * @var bool Whether img/audio/video tags are added to the removal list
     */
    protected $removeMedia = false;

    /**
     * @param string $html Text to process
     */
    public function __construct( $html ) {
        $this->html = $html;
    }

    /**
     * Turns an HTML fragment into a full document by adding a doctype and
     * html/head/body tags around it. getText() strips this wrapping again.
     *
     * @param string $html HTML fragment
     * @return string
     */
    public static function wrapHTML( $html ) {
        return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
    }

    /**
     * Hook for subclasses: called by getText() after the document wrapping
     * has been stripped and before tags are flattened. This implementation
     * returns its input unchanged.
     *
     * @param string $html
     * @return string
     */
    protected function onHtmlReady( $html ) {
        return $html;
    }

    /**
     * Returns the DOM document for the HTML, parsing it on first use.
     *
     * @return DOMDocument
     */
    public function getDoc() {
        if ( !$this->doc ) {
            $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

            // Workaround for bug that caused spaces before references
            // to disappear during processing:
            // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
            //
            // Please replace with a better fix if one can be found.
            $html = str_replace( ' <', '&#32;<', $html );

            // Collect parse errors internally instead of emitting warnings, but
            // remember the caller's setting so it can be restored afterwards
            // rather than being forced off.
            $previousErrorMode = libxml_use_internal_errors( true );

            // The entity loader only needs disabling on libxml older than 2.9.0;
            // newer versions do not load external entities by default and the
            // function is deprecated on current PHP.
            $manageEntityLoader = LIBXML_VERSION < 20900;
            $previousLoader = false;
            if ( $manageEntityLoader ) {
                $previousLoader = libxml_disable_entity_loader();
            }

            $this->doc = new DOMDocument();
            $this->doc->strictErrorChecking = false;
            // loadHTML() rejects an empty string, so leave the document empty then
            if ( (string)$html !== '' ) {
                $this->doc->loadHTML( $html );
            }

            if ( $manageEntityLoader ) {
                libxml_disable_entity_loader( $previousLoader );
            }
            // Drop whatever the parser reported, then hand the error mode back
            libxml_clear_errors();
            libxml_use_internal_errors( $previousErrorMode );

            $this->doc->encoding = 'UTF-8';
        }
        return $this->doc;
    }

    /**
     * Sets whether images and other media (img, audio, video tags) should be
     * removed by filterContent().
     *
     * @param bool $flag
     */
    public function setRemoveMedia( $flag = true ) {
        $this->removeMedia = $flag;
    }

    /**
     * Adds one or more selectors of content to remove. The supported forms
     * are those recognised by parseSelector():
     *   <tag>
     *   <tag>.<class>
     *   .<class>
     *   #<id>
     *
     * @param array|string $selectors Selector(s) of stuff to remove
     */
    public function remove( $selectors ) {
        $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
    }

    /**
     * Adds one or more element names to strip tags from while keeping the
     * tags' contents. Each entry is inserted verbatim into a regular
     * expression by getText(), so it may itself be a regex fragment.
     *
     * @param array|string $elements Name(s) of tag(s) to flatten
     */
    public function flatten( $elements ) {
        $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
    }

    /**
     * Instructs the formatter to flatten all tags.
     */
    public function flattenAllTags() {
        $this->flatten( '[?!]?[a-z0-9]+' );
    }

    /**
     * Removes content selected by remove() and setRemoveMedia() from the
     * document. The HTML is only parsed if there is something to remove.
     *
     * @return array Array of removed DOMElements
     */
    public function filterContent() {
        wfProfileIn( __METHOD__ );
        $removals = $this->parseItemsToRemove();

        // Bail out early if nothing to do
        $nothingToRemove = true;
        foreach ( $removals as $names ) {
            if ( $names ) {
                $nothingToRemove = false;
                break;
            }
        }
        if ( $nothingToRemove ) {
            wfProfileOut( __METHOD__ );
            return array();
        }

        $doc = $this->getDoc();

        // Remove tags

        // You can't remove DOMNodes from a DOMNodeList as you're iterating
        // over them in a foreach loop, so every step first queues the
        // matching elements and removes them afterwards.
        $queue = array();
        foreach ( $removals['TAG'] as $tagName ) {
            foreach ( $doc->getElementsByTagName( $tagName ) as $node ) {
                $queue[] = $node;
            }
        }
        $removed = $this->removeElements( $queue );

        // Elements with named IDs
        $queue = array();
        foreach ( $removals['ID'] as $id ) {
            $node = $doc->getElementById( $id );
            if ( $node ) {
                $queue[] = $node;
            }
        }
        $removed = array_merge( $removed, $this->removeElements( $queue ) );

        // CSS Classes
        $xpath = new DOMXPath( $doc );
        $queue = $this->findElementsByClass( $xpath, $removals['CLASS'] );
        $removed = array_merge( $removed, $this->removeElements( $queue ) );

        // Tags with CSS Classes
        // Note: this only matches elements whose class attribute is exactly
        // the given class, not elements carrying additional classes.
        foreach ( $removals['TAG_CLASS'] as $selector ) {
            $parts = explode( '.', $selector );

            $elements = $xpath->query(
                '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
            );
            if ( $elements === false ) {
                // The selector produced a malformed XPath expression; skip it
                continue;
            }
            $removed = array_merge( $removed, $this->removeElements( $elements ) );
        }

        wfProfileOut( __METHOD__ );
        return $removed;
    }

    /**
     * Finds the elements that carry any of the given class names. A class
     * matches only as a complete whitespace-separated token of the class
     * attribute, so "foo" matches class="x foo" but not class="foo-bar".
     *
     * Results are grouped by class name in the order given, and within each
     * group they follow the order returned by the XPath query. An element
     * matching several names appears once per name.
     *
     * @param DOMXPath $xpath XPath evaluator bound to the document
     * @param array $classNames Class names without the leading dot
     * @return array Array of matching DOMElements
     */
    private function findElementsByClass( DOMXPath $xpath, array $classNames ) {
        $matches = array();
        if ( !$classNames ) {
            return $matches;
        }

        // Collect every element that has a class attribute once, together
        // with its class tokens, instead of building one query per name.
        $nodes = $xpath->query( '//*[@class]' );
        if ( $nodes === false ) {
            return $matches;
        }
        $candidates = array();
        foreach ( $nodes as $node ) {
            if ( !$node->parentNode ) {
                continue;
            }
            $tokens = preg_split(
                '/[ \t\n\r\f]+/',
                $node->getAttribute( 'class' ),
                -1,
                PREG_SPLIT_NO_EMPTY
            );
            if ( $tokens ) {
                $candidates[] = array( $node, $tokens );
            }
        }

        foreach ( $classNames as $className ) {
            foreach ( $candidates as $candidate ) {
                if ( in_array( $className, $candidate[1], true ) ) {
                    $matches[] = $candidate[0];
                }
            }
        }
        return $matches;
    }

    /**
     * Removes a list of elements from the DOM. Elements without a parent
     * node are left alone but are still part of the returned list.
     *
     * @param array|DOMNodeList $elements
     * @return array Array of the elements that were passed in
     */
    private function removeElements( $elements ) {
        $list = $elements;
        if ( $elements instanceof DOMNodeList ) {
            // Copy into a plain array so that removal cannot disturb iteration
            $list = array();
            foreach ( $elements as $element ) {
                $list[] = $element;
            }
        }
        foreach ( $list as $element ) {
            $parent = $element->parentNode;
            if ( $parent ) {
                $parent->removeChild( $element );
            }
        }
        return $list;
    }

    /**
     * Converts HTML entities in libxml output back to UTF-8 characters.
     * The four entities &quot; &amp; &lt; and &gt; are escaped once more
     * beforehand so that they remain entities after the conversion.
     *
     * @param string $html Markup as serialised by libxml
     * @return string
     */
    private function fixLibXML( $html ) {
        wfProfileIn( __METHOD__ );
        static $replacements;
        if ( !$replacements ) {
            // We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
            // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
            $replacements = new ReplacementArray( array(
                '&quot;' => '&amp;quot;',
                '&amp;' => '&amp;amp;',
                '&lt;' => '&amp;lt;',
                '&gt;' => '&amp;gt;',
            ) );
        }
        $html = $replacements->replace( $html );
        $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
        wfProfileOut( __METHOD__ );
        return $html;
    }

    /**
     * Returns the processed HTML with the wrapHTML() wrapping and comments
     * stripped and the tags queued by flatten() removed.
     *
     * If the document has been parsed (getDoc() or a filterContent() call
     * that had something to remove), the DOM is serialised; otherwise the
     * string given to the constructor is used as is.
     *
     * @param DOMElement|string|null $element Element, or ID of the element,
     *  to get the HTML from. When given and found in a parsed document, the
     *  body content is replaced by that element before serialising. Ignored
     *  when the document has not been parsed.
     * @return string Processed HTML
     */
    public function getText( $element = null ) {
        wfProfileIn( __METHOD__ );

        if ( $this->doc ) {
            wfProfileIn( __METHOD__ . '-dom' );
            if ( $element !== null && !( $element instanceof DOMElement ) ) {
                $element = $this->doc->getElementById( $element );
            }
            if ( $element ) {
                $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
                // A document parsed from empty input has no body to replace
                if ( $body ) {
                    // Snapshot the children first; removing while iterating
                    // the live node list would skip nodes.
                    $children = array();
                    foreach ( $body->childNodes as $child ) {
                        $children[] = $child;
                    }
                    foreach ( $children as $child ) {
                        $body->removeChild( $child );
                    }
                    $body->appendChild( $element );
                }
            }
            $html = $this->doc->saveHTML();
            wfProfileOut( __METHOD__ . '-dom' );

            wfProfileIn( __METHOD__ . '-fixes' );
            $html = $this->fixLibXML( $html );
            if ( wfIsWindows() ) {
                // Cleanup for CRLF misprocessing of unknown origin on Windows.
                //
                // If this error continues in the future, please track it down in the
                // XML code paths if possible and fix there.
                $html = str_replace( '&#13;', '', $html );
            }
            wfProfileOut( __METHOD__ . '-fixes' );
        } else {
            $html = $this->html;
        }
        // Remove stuff added by wrapHTML()
        $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
        $html = $this->onHtmlReady( $html );

        wfProfileIn( __METHOD__ . '-flatten' );
        if ( $this->elementsToFlatten ) {
            $elements = implode( '|', $this->elementsToFlatten );
            $html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
        }
        wfProfileOut( __METHOD__ . '-flatten' );

        wfProfileOut( __METHOD__ );
        return $html;
    }

    /**
     * Classifies a selector as one of CLASS (".name"), ID ("#name"),
     * TAG_CLASS ("tag.name") or TAG ("tag").
     *
     * @param string $selector Selector to parse
     * @param string $type Receives the type of the selector
     * @param string $rawName Receives the name with the leading "." or "#"
     *  removed; TAG and TAG_CLASS selectors are passed through unchanged
     * @return bool Always true; unrecognised selectors raise an exception
     * @throws MWException If the selector contains "[" or "]" and matches
     *  none of the other forms
     */
    protected function parseSelector( $selector, &$type, &$rawName ) {
        $dotPosition = strpos( $selector, '.' );

        if ( $dotPosition === 0 ) {
            $type = 'CLASS';
            $rawName = substr( $selector, 1 );
        } elseif ( strpos( $selector, '#' ) === 0 ) {
            $type = 'ID';
            $rawName = substr( $selector, 1 );
        } elseif ( $dotPosition !== false ) {
            $type = 'TAG_CLASS';
            $rawName = $selector;
        } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
            $type = 'TAG';
            $rawName = $selector;
        } else {
            throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
        }

        return true;
    }

    /**
     * Sorts the queued selectors into per-type lists and adds the media
     * tags when media removal is enabled.
     *
     * @return array Lists of names keyed by 'ID', 'TAG', 'CLASS' and 'TAG_CLASS'
     */
    protected function parseItemsToRemove() {
        wfProfileIn( __METHOD__ );
        $removals = array(
            'ID' => array(),
            'TAG' => array(),
            'CLASS' => array(),
            'TAG_CLASS' => array(),
        );

        foreach ( $this->itemsToRemove as $selector ) {
            $type = '';
            $rawName = '';
            if ( $this->parseSelector( $selector, $type, $rawName ) ) {
                $removals[$type][] = $rawName;
            }
        }

        if ( $this->removeMedia ) {
            $removals['TAG'][] = 'img';
            $removals['TAG'][] = 'audio';
            $removals['TAG'][] = 'video';
        }

        wfProfileOut( __METHOD__ );
        return $removals;
    }
}