MediaWiki  REL1_22
Tidy.php
Go to the documentation of this file.
00001 <?php
/**
 * Prepares wikitext parser output for HTML Tidy and undoes that preparation
 * once Tidy has run.
 *
 * getWrapped() swaps <mw:editsection> and <mw:toc> markup for unique
 * placeholder strings, renames <link>/<meta> to <html-link>/<html-meta>, and
 * embeds the result in a complete XHTML document. postprocess() reverses the
 * renaming and substitutes the original markup back in for the placeholders.
 */
class MWTidyWrapper {

    /**
     * @var ReplacementArray|null Placeholder => original markup pairs;
     *   null until getWrapped() has been called
     */
    protected $mTokens;

    /**
     * @var string|null Random prefix shared by all placeholders generated by
     *   one getWrapped() call; null until getWrapped() has been called
     */
    protected $mUniqPrefix;

    /**
     * @var int Sequence number appended to the next placeholder
     */
    protected $mMarkerIndex;

    public function __construct() {
        $this->mTokens = null;
        $this->mUniqPrefix = null;
        $this->mMarkerIndex = 0;
    }

    /**
     * Hide markup that Tidy must not touch and wrap the text in a full
     * XHTML document.
     *
     * Each call starts a fresh placeholder set, so postprocess() only
     * restores the tokens of the most recent getWrapped() call.
     *
     * @param string $text Markup fragment to be passed to Tidy
     * @return string Complete XHTML document containing the masked fragment
     */
    public function getWrapped( $text ) {
        $this->mTokens = new ReplacementArray;
        $this->mUniqPrefix = "\x7fUNIQ" .
            dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
        $this->mMarkerIndex = 0;

        // Objects are passed by handle, so the callable needs no reference
        // to $this.
        $callback = array( $this, 'replaceCallback' );

        // Replace <mw:editsection> elements with placeholders
        $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
            $callback, $text );
        // ...and <mw:toc> markers
        $wrappedtext = preg_replace_callback( '/<\\/?mw:toc>/',
            $callback, $wrappedtext );

        // Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
        // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
        $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );

        // Wrap the whole thing in a doctype and body for Tidy.
        $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' .
            ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' .
            '<head><title>test</title></head><body>' . $wrappedtext . '</body></html>';

        return $wrappedtext;
    }

    /**
     * preg_replace_callback() handler: remember the matched markup and
     * return the placeholder that stands in for it.
     *
     * @param array $m Match array; $m[0] is the full matched text
     * @return string Placeholder inserted in place of the match
     */
    public function replaceCallback( $m ) {
        $marker = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
        $this->mMarkerIndex++;
        $this->mTokens->setPair( $marker, $m[0] );
        return $marker;
    }

    /**
     * Undo the changes made by getWrapped() on text that has been through
     * Tidy.
     *
     * @param string $text Tidy output
     * @return string Text with <link>/<meta> names and hidden markup restored
     */
    public function postprocess( $text ) {
        // Revert <html-{link,meta}> back to <{link,meta}>
        $text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );

        // Restore the contents of placeholder tokens. There is nothing to
        // restore if getWrapped() has not been called on this instance.
        if ( $this->mTokens !== null ) {
            $text = $this->mTokens->replace( $text );
        }

        return $text;
    }

}
00108 
/**
 * Static interface to HTML Tidy.
 *
 * Depending on $wgTidyInternal, the work is done either by PHP's built-in
 * tidy class or by an external tidy executable started with proc_open().
 */
class MWTidy {
    /**
     * Run a markup fragment through Tidy and return the repaired result.
     *
     * The text is masked and wrapped by MWTidyWrapper before Tidy sees it and
     * unmasked afterwards. Note that the wrapper's <html>/<body> envelope is
     * not removed here.
     *
     * @param string $text Markup to clean up
     * @return string Tidy's output, or the unmodified $text followed by an
     *   HTML comment when Tidy could not run or produced no usable output
     */
    public static function tidy( $text ) {
        global $wgTidyInternal;

        $wrapper = new MWTidyWrapper;
        $wrappedtext = $wrapper->getWrapped( $text );

        $retVal = null;
        if ( $wgTidyInternal ) {
            $correctedtext = self::execInternalTidy( $wrappedtext, false, $retVal );
        } else {
            $correctedtext = self::execExternalTidy( $wrappedtext, false, $retVal );
        }

        if ( $retVal < 0 ) {
            // A negative status means Tidy never ran (missing class or
            // process could not be started / closed cleanly).
            wfDebug( "Possible tidy configuration error!\n" );
            return $text . "\n<!-- Tidy was unable to run -->\n";
        } elseif ( is_null( $correctedtext ) ) {
            wfDebug( "Tidy error detected!\n" );
            return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
        }

        $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens

        return $correctedtext;
    }

    /**
     * Run Tidy in error-reporting mode over the given text.
     *
     * Unlike tidy(), the text is passed to Tidy as-is, without wrapping.
     *
     * @param string $text Markup to check
     * @param string|null &$errorStr Receives Tidy's error output
     * @return bool True when Tidy's status is 0, or when the status is
     *   negative and no error output was produced; false otherwise
     */
    public static function checkErrors( $text, &$errorStr = null ) {
        global $wgTidyInternal;

        $retval = 0;
        if ( $wgTidyInternal ) {
            $errorStr = self::execInternalTidy( $text, true, $retval );
        } else {
            $errorStr = self::execExternalTidy( $text, true, $retval );
        }

        return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
    }

    /**
     * Spawn the external tidy executable and feed it $text on stdin.
     *
     * @param string $text Document to send to tidy
     * @param bool $stderr If true, capture and return stderr (stdout is
     *   discarded); if false, capture and return stdout (stderr is discarded)
     * @param int|null &$retval Receives the value returned by proc_close(),
     *   or -1 if the process could not be started
     * @return string|null Captured output; null when $stderr is false and
     *   nothing was captured for non-empty input
     */
    private static function execExternalTidy( $text, $stderr = false, &$retval = null ) {
        global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
        wfProfileIn( __METHOD__ );

        $cleansource = '';
        $opts = ' -utf8';

        // Only the stream we want to read is a pipe; the other output stream
        // is sent to the null device.
        if ( $stderr ) {
            $descriptorspec = array(
                0 => array( 'pipe', 'r' ),
                1 => array( 'file', wfGetNull(), 'a' ),
                2 => array( 'pipe', 'w' )
            );
        } else {
            $descriptorspec = array(
                0 => array( 'pipe', 'r' ),
                1 => array( 'pipe', 'w' ),
                2 => array( 'file', wfGetNull(), 'a' )
            );
        }

        $readpipe = $stderr ? 2 : 1;
        $pipes = array();

        $process = proc_open(
            "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes );

        if ( is_resource( $process ) ) {
            // Theoretically, this style of communication could cause a deadlock
            // here. If the stdout buffer fills up, then writes to stdin could
            // block. This doesn't appear to happen with tidy, because tidy only
            // writes to stdout after it's finished reading from stdin. Search
            // for tidyParseStdin and tidySaveStdout in console/tidy.c
            fwrite( $pipes[0], $text );
            fclose( $pipes[0] );

            // Read everything up to EOF in one call. A manual feof()/fgets()
            // loop would never terminate if a read failed without the
            // stream reaching EOF.
            $output = stream_get_contents( $pipes[$readpipe] );
            if ( $output !== false ) {
                $cleansource = $output;
            }

            fclose( $pipes[$readpipe] );
            $retval = proc_close( $process );
        } else {
            wfWarn( "Unable to start external tidy process" );
            $retval = -1;
        }

        if ( !$stderr && $cleansource == '' && $text != '' ) {
            // Some kind of error happened, so we couldn't get the corrected text.
            // Just give up; we'll use the source text and append a warning.
            $cleansource = null;
        }

        wfProfileOut( __METHOD__ );
        return $cleansource;
    }

    /**
     * Process $text with PHP's built-in tidy class.
     *
     * @param string $text Document to parse
     * @param bool $stderr If true, return Tidy's error buffer instead of the
     *   repaired document (no clean-and-repair pass is performed)
     * @param int|null &$retval Receives Tidy's status code, or -1 if the
     *   tidy class is not available
     * @return string|null Repaired document or error buffer; null when the
     *   tidy class is missing or Tidy reports status 2 after repair
     */
    private static function execInternalTidy( $text, $stderr = false, &$retval = null ) {
        global $wgTidyConf, $wgDebugTidy;
        wfProfileIn( __METHOD__ );

        if ( !class_exists( 'tidy' ) ) {
            wfWarn( "Unable to load internal tidy class." );
            $retval = -1;

            wfProfileOut( __METHOD__ );
            return null;
        }

        $tidy = new tidy;
        $tidy->parseString( $text, $wgTidyConf, 'utf8' );

        if ( $stderr ) {
            // Error-reporting mode: hand back the diagnostics from parsing.
            $retval = $tidy->getStatus();

            wfProfileOut( __METHOD__ );
            return $tidy->errorBuffer;
        }

        $tidy->cleanRepair();
        $retval = $tidy->getStatus();
        if ( $retval == 2 ) {
            // 2 is magic number for fatal error
            // http://www.php.net/manual/en/function.tidy-get-status.php
            $cleansource = null;
        } else {
            $cleansource = tidy_get_output( $tidy );
            if ( $wgDebugTidy && $retval > 0 ) {
                // Append the diagnostics as an HTML comment, escaping any
                // comment terminator so the report cannot end it early.
                $cleansource .= "<!--\nTidy reports:\n" .
                    str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
                    "\n-->";
            }
        }

        wfProfileOut( __METHOD__ );
        return $cleansource;
    }
}