MediaWiki
REL1_22
|
<?php
/**
 * Removes selected elements from an HTML fragment and optionally flattens
 * tags, using DOMDocument (libxml) for parsing and serialisation.
 *
 * Typical usage: construct with HTML (see wrapHTML()), register selectors via
 * remove() / flatten(), call filterContent(), then read the result with
 * getText().
 */
class HtmlFormatter {
	/**
	 * @var DOMDocument|null Parsed document; created lazily by getDoc()
	 */
	private $doc;

	/**
	 * @var string HTML passed to the constructor
	 */
	private $html;

	/**
	 * @var string[] Selectors registered through remove()
	 */
	private $itemsToRemove = array();

	/**
	 * @var string[] Regex fragments for tag names registered through flatten()
	 */
	private $elementsToFlatten = array();

	/**
	 * @var bool Whether parseItemsToRemove() adds img/audio/video to the removals
	 */
	protected $removeMedia = false;

	/**
	 * @param string $html Text to process
	 */
	public function __construct( $html ) {
		$this->html = $html;
	}

	/**
	 * Wraps an HTML fragment in a minimal document skeleton.
	 *
	 * @param string $html HTML fragment
	 * @return string Full document with $html placed inside <body>
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Hook for subclasses: called by getText() after comments and the
	 * document wrapper have been stripped, before tags are flattened.
	 * The base implementation returns its input unchanged.
	 *
	 * @param string $html
	 * @return string
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Returns the DOMDocument for the HTML, parsing it on first use.
	 *
	 * @return DOMDocument
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

			// Workaround for bug that caused spaces before references
			// to disappear during processing:
			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
			//
			// Please replace with a better fix if one can be found.
			$html = str_replace( ' <', '&#32;<', $html );

			// Remember the caller's libxml error mode so it can be restored
			// instead of being forced to "off" afterwards.
			$previousErrorMode = libxml_use_internal_errors( true );
			$loader = libxml_disable_entity_loader();
			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );
			libxml_disable_entity_loader( $loader );
			// Parse errors were buffered rather than displayed; drop them so
			// the buffer does not keep growing across calls.
			libxml_clear_errors();
			libxml_use_internal_errors( $previousErrorMode );
			$this->doc->encoding = 'UTF-8';
		}
		return $this->doc;
	}

	/**
	 * Sets whether images, audio and video should be removed by filterContent().
	 *
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Registers selectors of content to remove in filterContent().
	 * Forms understood by parseSelector(): "tag", "tag.class", ".class", "#id".
	 *
	 * @param array|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Registers tags whose markup getText() strips while keeping their content.
	 * Each entry is inserted verbatim into a regular expression alternation,
	 * so it may itself be a regex fragment.
	 *
	 * @param array|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs getText() to flatten every tag.
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes from the document every element matched by the registered
	 * selectors (and media tags, when enabled).
	 */
	public function filterContent() {
		wfProfileIn( __METHOD__ );
		$removals = $this->parseItemsToRemove();

		// The base parseItemsToRemove() always returns the four-bucket array,
		// so this guard only triggers when a subclass returns an empty value.
		if ( !$removals ) {
			wfProfileOut( __METHOD__ );
			return;
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = array();
		foreach ( $removals['TAG'] as $tagToRemove ) {
			$tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
			foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
				if ( $tagToRemoveNode ) {
					$domElemsToRemove[] = $tagToRemoveNode;
				}
			}
		}

		$this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = array();
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$this->removeElements( $domElemsToRemove );

		// CSS Classes
		$domElemsToRemove = array();
		$xpath = new DOMXpath( $doc );
		foreach ( $removals['CLASS'] as $classToRemove ) {
			$elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
			if ( !$elements ) {
				// query() returns false for an expression it cannot evaluate
				continue;
			}

			// contains() is only a substring pre-filter; confirm a word-bounded
			// match. The name is quoted so regex metacharacters (or the "/"
			// delimiter) in it cannot alter or break the pattern.
			$classPattern = '/\b' . preg_quote( $classToRemove, '/' ) . '\b/';
			foreach ( $elements as $element ) {
				$classes = $element->getAttribute( 'class' );
				if ( preg_match( $classPattern, $classes ) && $element->parentNode ) {
					$domElemsToRemove[] = $element;
				}
			}
		}
		$this->removeElements( $domElemsToRemove );

		// Tags with CSS Classes
		// Note: this matches only elements whose class attribute is exactly
		// the given class, and only the first class after the tag name is used.
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			$parts = explode( '.', $classToRemove );

			$elements = $xpath->query(
				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
			);

			if ( $elements ) {
				$this->removeElements( $elements );
			}
		}

		wfProfileOut( __METHOD__ );
	}

	/**
	 * Detaches the given elements from their parents.
	 *
	 * @param array|DOMNodeList $elements
	 */
	private function removeElements( $elements ) {
		// Snapshot node lists into a plain array first: removing nodes while
		// iterating a DOMNodeList disturbs the iteration.
		$list = $elements instanceof DOMNodeList
			? iterator_to_array( $elements )
			: $elements;

		foreach ( $list as $element ) {
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
	}

	/**
	 * Converts libxml's entity-encoded output back to UTF-8 while keeping the
	 * four markup-significant entities escaped.
	 *
	 * @param string $html HTML as serialised by libxml
	 * @return string
	 */
	private function fixLibXML( $html ) {
		wfProfileIn( __METHOD__ );
		static $replacements;
		if ( !$replacements ) {
			// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			//
			// Each entity is escaped once more so that the HTML-ENTITIES decode
			// below turns it back into the entity rather than the raw character.
			$replacements = new ReplacementArray( array(
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			) );
		}
		$html = $replacements->replace( $html );
		$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Returns the processed HTML with comments and the document wrapper
	 * stripped and registered tags flattened.
	 *
	 * If the document has been parsed and $element resolves to a node, the
	 * body content is replaced by that node before serialising. If the
	 * document was never parsed, the original HTML string is used and
	 * $element is ignored.
	 *
	 * @param DOMElement|string|null $element Element, or ID of an element, to
	 *   restrict the output to; null for the whole body
	 * @return string
	 */
	public function getText( $element = null ) {
		wfProfileIn( __METHOD__ );

		if ( $this->doc ) {
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				// Copy the child list before removing: childNodes is live
				$nodesArray = array();
				foreach ( $body->childNodes as $node ) {
					$nodesArray[] = $node;
				}
				foreach ( $nodesArray as $nodeArray ) {
					$body->removeChild( $nodeArray );
				}
				$body->appendChild( $element );
			}
			$html = $this->doc->saveHTML();
			$html = $this->fixLibXML( $html );
		} else {
			$html = $this->html;
		}
		if ( wfIsWindows() ) {
			// Appears to be cleanup for CRLF misprocessing of unknown origin
			// when running server on Windows platform.
			//
			// If this error continues in the future, please track it down in the
			// XML code paths if possible and fix there.
			$html = str_replace( '&#13;', '', $html );
		}
		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
		$html = $this->onHtmlReady( $html );

		if ( $this->elementsToFlatten ) {
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}

		wfProfileOut( __METHOD__ );
		return $html;
	}

	/**
	 * Classifies a selector and extracts the name to match.
	 *
	 * @param string $selector Selector to parse
	 * @param string &$type Set to 'CLASS', 'ID', 'TAG_CLASS' or 'TAG'
	 * @param string &$rawName Set to the selector without its leading "." or
	 *   "#" (CLASS, ID), or to the whole selector (TAG_CLASS, TAG)
	 * @return bool Always true
	 * @throws MWException If the selector is not recognised
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		$dotPos = strpos( $selector, '.' );

		if ( $dotPos === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( $dotPos !== false ) {
			// A dot somewhere after the first character: "tag.class"
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false
			&& strpos( $selector, ']' ) === false )
		{
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Sorts the registered selectors into buckets by selector type and adds
	 * the media tags when media removal is enabled.
	 *
	 * @return array Map with keys 'ID', 'TAG', 'CLASS' and 'TAG_CLASS', each
	 *   holding a list of names
	 */
	protected function parseItemsToRemove() {
		wfProfileIn( __METHOD__ );
		$removals = array(
			'ID' => array(),
			'TAG' => array(),
			'CLASS' => array(),
			'TAG_CLASS' => array(),
		);

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		wfProfileOut( __METHOD__ );
		return $removals;
	}
}