MediaWiki  REL1_22
HtmlFormatter.php
Go to the documentation of this file.
00001 <?php
/**
 * Filters and flattens HTML.
 *
 * Typical use: construct with an HTML string, register selectors with
 * remove() and tag names with flatten(), call filterContent() to apply the
 * removals to the DOM, then call getText() to obtain the resulting HTML.
 */
class HtmlFormatter {
    /**
     * @var DOMDocument|null Parsed document, built lazily by getDoc()
     */
    private $doc;

    /** @var string HTML passed to the constructor */
    private $html;
    /** @var string[] Selectors registered through remove() */
    private $itemsToRemove = array();
    /** @var string[] Tag-name regex fragments registered through flatten() */
    private $elementsToFlatten = array();
    /** @var bool Whether img, audio and video tags are added to the removals */
    protected $removeMedia = false;

    /**
     * @param string $html HTML to work on. getDoc() hands it to
     *  DOMDocument::loadHTML(); see wrapHTML() for turning a fragment into
     *  a full document.
     */
    public function __construct( $html ) {
        $this->html = $html;
    }

    /**
     * Wraps an HTML fragment in a minimal document skeleton
     * (doctype, html, empty head, body).
     *
     * @param string $html Fragment to place inside the body element
     * @return string
     */
    public static function wrapHTML( $html ) {
        return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
    }

    /**
     * Hook invoked by getText() after comments and everything outside the
     * body have been stripped, and before tags are flattened. The default
     * implementation returns the HTML unchanged.
     *
     * @param string $html
     * @return string
     */
    protected function onHtmlReady( $html ) {
        return $html;
    }

    /**
     * Returns the DOM for the HTML given to the constructor, parsing it on
     * first use and caching the result.
     *
     * @return DOMDocument
     */
    public function getDoc() {
        if ( !$this->doc ) {
            $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

            // Workaround for bug that caused spaces before references
            // to disappear during processing:
            // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
            //
            // Please replace with a better fix if one can be found.
            $html = str_replace( ' <', '&#32;<', $html );

            // Remember the caller's libxml error mode so it can be restored
            // afterwards instead of being forced off.
            $previousErrorMode = libxml_use_internal_errors( true );
            $loader = libxml_disable_entity_loader();
            $this->doc = new DOMDocument();
            $this->doc->strictErrorChecking = false;
            // loadHTML() rejects an empty string; leave the document empty
            // in that case rather than triggering an error.
            if ( (string)$html !== '' ) {
                $this->doc->loadHTML( $html );
            }
            libxml_disable_entity_loader( $loader );
            // Drop the parse errors collected above, then restore the
            // previous error-handling mode.
            libxml_clear_errors();
            libxml_use_internal_errors( $previousErrorMode );
            $this->doc->encoding = 'UTF-8';
        }
        return $this->doc;
    }

    /**
     * Sets whether img, audio and video elements are removed by
     * filterContent().
     *
     * @param bool $flag
     */
    public function setRemoveMedia( $flag = true ) {
        $this->removeMedia = $flag;
    }

    /**
     * Registers selectors of elements to be removed by filterContent().
     *
     * Recognized forms (see parseSelector()):
     *   - ".class"    elements having that class
     *   - "#id"       the element with that ID
     *   - "tag.class" elements of that tag whose class attribute is exactly "class"
     *   - "tag"       all elements of that tag
     *
     * @param string|string[] $selectors One selector or an array of selectors
     */
    public function remove( $selectors ) {
        $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
    }

    /**
     * Registers tags to be flattened by getText(), i.e. the opening and
     * closing tags are stripped from the output.
     *
     * Each entry is inserted verbatim into a regular expression delimited
     * by '#', so it may itself be a regex fragment.
     *
     * @param string|string[] $elements One tag name or an array of tag names
     */
    public function flatten( $elements ) {
        $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
    }

    /**
     * Registers a pattern with flatten() that matches every tag name.
     */
    public function flattenAllTags() {
        $this->flatten( '[?!]?[a-z0-9]+' );
    }

    /**
     * Removes from the DOM every element matched by the selectors given to
     * remove(), plus media elements when setRemoveMedia() is enabled.
     *
     * When nothing is registered for removal the DOM is not built at all,
     * so a later getText() works from the original HTML unless getDoc()
     * has been called elsewhere.
     */
    public function filterContent() {
        wfProfileIn( __METHOD__ );
        $removals = $this->parseItemsToRemove();

        // $removals always has its four type keys, so test whether any of
        // the per-type lists is non-empty rather than the array itself.
        if ( !array_filter( $removals ) ) {
            wfProfileOut( __METHOD__ );
            return;
        }

        $doc = $this->getDoc();

        // Remove tags

        // You can't remove DOMNodes from a DOMNodeList as you're iterating
        // over them in a foreach loop. It will seemingly leave the internal
        // iterator on the foreach out of wack and results will be quite
        // strange. Though, making a queue of items to remove seems to work.
        $domElemsToRemove = array();
        foreach ( $removals['TAG'] as $tagToRemove ) {
            foreach ( $doc->getElementsByTagName( $tagToRemove ) as $tagToRemoveNode ) {
                $domElemsToRemove[] = $tagToRemoveNode;
            }
        }
        $this->removeElements( $domElemsToRemove );

        // Elements with named IDs
        $domElemsToRemove = array();
        foreach ( $removals['ID'] as $itemToRemove ) {
            $itemToRemoveNode = $doc->getElementById( $itemToRemove );
            if ( $itemToRemoveNode ) {
                $domElemsToRemove[] = $itemToRemoveNode;
            }
        }
        $this->removeElements( $domElemsToRemove );

        // CSS Classes
        $xpath = new DOMXpath( $doc );
        if ( $removals['CLASS'] ) {
            $domElemsToRemove = array();
            // Compare whole whitespace-separated class tokens, so that
            // ".foo" does not also match class="foo-bar" and class names
            // never end up inside a regex or an XPath string literal.
            foreach ( $xpath->query( '//*[@class]' ) as $element ) {
                $classes = preg_split(
                    '/\s+/',
                    $element->getAttribute( 'class' ),
                    -1,
                    PREG_SPLIT_NO_EMPTY
                );
                if ( array_intersect( $removals['CLASS'], $classes ) ) {
                    $domElemsToRemove[] = $element;
                }
            }
            $this->removeElements( $domElemsToRemove );
        }

        // Tags with CSS Classes
        foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
            $parts = explode( '.', $classToRemove );

            // Note: this matches the complete value of the class attribute,
            // not an individual class token.
            $elements = $xpath->query(
                '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
            );

            // query() yields false when the selector does not form a valid
            // XPath expression; there is nothing to remove in that case.
            if ( $elements ) {
                $this->removeElements( $elements );
            }
        }

        wfProfileOut( __METHOD__ );
    }

    /**
     * Detaches the given elements from their parent nodes.
     *
     * @param array|DOMNodeList $elements
     */
    private function removeElements( $elements ) {
        // Snapshot node lists into a plain array first: removing nodes
        // while iterating over a DOMNodeList disturbs the iteration.
        $list = $elements instanceof DOMNodeList
            ? iterator_to_array( $elements, false )
            : $elements;

        foreach ( $list as $element ) {
            // Skip nodes that are already detached, e.g. because an
            // ancestor-independent earlier removal took them out.
            if ( $element->parentNode ) {
                $element->parentNode->removeChild( $element );
            }
        }
    }

    /**
     * Post-processes HTML serialized by libxml: the entities &quot;, &amp;,
     * &lt; and &gt; are escaped once more, and the string is then converted
     * from HTML-ENTITIES to UTF-8.
     *
     * @param string $html HTML produced by DOMDocument::saveHTML()
     * @return string
     */
    private function fixLibXML( $html ) {
        wfProfileIn( __METHOD__ );
        static $replacements;
        if ( !$replacements ) {
            // We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
            // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
            $replacements = new ReplacementArray( array(
                '&quot;' => '&amp;quot;',
                '&amp;' => '&amp;amp;',
                '&lt;' => '&amp;lt;',
                '&gt;' => '&amp;gt;',
            ) );
        }
        $html = $replacements->replace( $html );
        $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
        wfProfileOut( __METHOD__ );
        return $html;
    }

    /**
     * Returns the processed HTML: comments and everything outside the body
     * are stripped, onHtmlReady() is applied, and the tags registered with
     * flatten() are removed.
     *
     * If the DOM has been built, the HTML is serialized from it; otherwise
     * the original HTML is used and $element is ignored.
     *
     * @param DOMElement|string|null $element Element, or ID of the element,
     *  that should become the sole content of the body. When it is null or
     *  cannot be found, the whole body is returned.
     * @return string
     */
    public function getText( $element = null ) {
        wfProfileIn( __METHOD__ );

        if ( $this->doc ) {
            if ( $element !== null && !( $element instanceof DOMElement ) ) {
                $element = $this->doc->getElementById( $element );
            }
            if ( $element ) {
                // Empty the body, then re-attach only the requested element.
                $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
                $nodesArray = array();
                foreach ( $body->childNodes as $node ) {
                    $nodesArray[] = $node;
                }
                foreach ( $nodesArray as $nodeArray ) {
                    $body->removeChild( $nodeArray );
                }
                $body->appendChild( $element );
            }
            $html = $this->doc->saveHTML();
            $html = $this->fixLibXML( $html );
        } else {
            $html = $this->html;
        }
        if ( wfIsWindows() ) {
            // Appears to be cleanup for CRLF misprocessing of unknown origin
            // when running server on Windows platform.
            //
            // If this error continues in the future, please track it down in the
            // XML code paths if possible and fix there.
            $html = str_replace( '&#13;', '', $html );
        }
        // Strip comments, everything up to and including <body>, and
        // everything from </body> onwards.
        $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
        $html = $this->onHtmlReady( $html );

        if ( $this->elementsToFlatten ) {
            $elements = implode( '|', $this->elementsToFlatten );
            $html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
        }

        wfProfileOut( __METHOD__ );
        return $html;
    }

    /**
     * Classifies a selector as one of CLASS, ID, TAG_CLASS or TAG.
     *
     * @param string $selector Selector as passed to remove()
     * @param string &$type Receives 'CLASS', 'ID', 'TAG_CLASS' or 'TAG'
     * @param string &$rawName Receives the selector without its leading
     *  '.' or '#' for CLASS and ID, or the unchanged selector otherwise
     * @return bool Always true
     * @throws MWException If the selector contains square brackets and is
     *  not of one of the other recognized forms
     */
    protected function parseSelector( $selector, &$type, &$rawName ) {
        if ( strpos( $selector, '.' ) === 0 ) {
            $type = 'CLASS';
            $rawName = substr( $selector, 1 );
        } elseif ( strpos( $selector, '#' ) === 0 ) {
            $type = 'ID';
            $rawName = substr( $selector, 1 );
        } elseif ( strpos( $selector, '.' ) !== false ) {
            // A dot somewhere after the first character: "tag.class"
            $type = 'TAG_CLASS';
            $rawName = $selector;
        } elseif ( strpos( $selector, '[' ) === false
            && strpos( $selector, ']' ) === false )
        {
            $type = 'TAG';
            $rawName = $selector;
        } else {
            throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
        }

        return true;
    }

    /**
     * Groups the selectors registered through remove() by type and appends
     * the media tags when media removal is enabled.
     *
     * @return array Lists of raw names keyed by 'ID', 'TAG', 'CLASS' and
     *  'TAG_CLASS'; all four keys are always present
     */
    protected function parseItemsToRemove() {
        wfProfileIn( __METHOD__ );
        $removals = array(
            'ID' => array(),
            'TAG' => array(),
            'CLASS' => array(),
            'TAG_CLASS' => array(),
        );

        foreach ( $this->itemsToRemove as $itemToRemove ) {
            $type = '';
            $rawName = '';
            if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
                $removals[$type][] = $rawName;
            }
        }

        if ( $this->removeMedia ) {
            $removals['TAG'][] = 'img';
            $removals['TAG'][] = 'audio';
            $removals['TAG'][] = 'video';
        }

        wfProfileOut( __METHOD__ );
        return $removals;
    }
}