MediaWiki  REL1_24
MWTidy.php
Go to the documentation of this file.
00001 <?php
/**
 * Hides markup that Tidy must not touch behind opaque placeholder markers
 * before Tidy runs, and puts the original markup back afterwards.
 *
 * Usage: call getWrapped() on the raw HTML, feed the result to Tidy, then
 * call postprocess() on Tidy's output using the same wrapper instance.
 */
class MWTidyWrapper {

    /**
     * Marker => original markup pairs recorded by replaceCallback().
     * Null until getWrapped() has been called.
     *
     * @var ReplacementArray|null
     */
    protected $mTokens;

    /**
     * Random prefix shared by all markers generated for one getWrapped() call.
     *
     * @var string|null
     */
    protected $mUniqPrefix;

    /**
     * Sequence number used for the next marker.
     *
     * @var int
     */
    protected $mMarkerIndex;

    public function __construct() {
        $this->mTokens = null;
        $this->mUniqPrefix = null;
        // Initialise alongside the other properties so the object is never
        // in a half-initialised state.
        $this->mMarkerIndex = 0;
    }

    /**
     * Prepare an HTML fragment for Tidy: swap sensitive markup for
     * placeholder markers, rename inline <link>/<meta> elements, and wrap the
     * result in a complete XHTML document.
     *
     * Each call starts a fresh marker set; markers from a previous call on
     * the same instance are discarded.
     *
     * @param string $text HTML fragment
     * @return string Complete document to hand to Tidy
     */
    public function getWrapped( $text ) {
        $this->mTokens = new ReplacementArray;
        $this->mUniqPrefix = "\x7fUNIQ" .
            dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
        $this->mMarkerIndex = 0;

        // Objects are passed by handle, so the callable needs no reference to $this.
        $callback = array( $this, 'replaceCallback' );

        // Replace <mw:editsection> elements with placeholders
        $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
            $callback, $text );
        // ...and <mw:toc> markers
        $wrappedtext = preg_replace_callback( '/<\\/?mw:toc>/',
            $callback, $wrappedtext );
        // ... and <math> tags
        $wrappedtext = preg_replace_callback( '/<math(.*?)<\\/math>/s',
            $callback, $wrappedtext );
        // Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
        // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
        $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );

        // Wrap the whole thing in a doctype and body for Tidy.
        $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' .
            ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' .
            '<head><title>test</title></head><body>' . $wrappedtext . '</body></html>';

        return $wrappedtext;
    }

    /**
     * preg_replace_callback() handler: remember the matched markup under a
     * newly generated marker and return that marker as the replacement.
     *
     * @param array $m Match array; only the full match $m[0] is used
     * @return string The placeholder marker
     */
    public function replaceCallback( $m ) {
        $marker = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
        $this->mMarkerIndex++;
        $this->mTokens->setPair( $marker, $m[0] );
        return $marker;
    }

    /**
     * Undo the changes made by getWrapped() on text that has been through Tidy.
     *
     * @param string $text Tidy output
     * @return string Text with <link>/<meta> names and hidden markup restored
     */
    public function postprocess( $text ) {
        // Revert <html-{link,meta}> back to <{link,meta}>
        $text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );

        // Restore the contents of placeholder tokens. If getWrapped() was
        // never called there are no tokens, so skip the replacement instead
        // of calling a method on null.
        if ( $this->mTokens !== null ) {
            $text = $this->mTokens->replace( $text );
        }

        return $text;
    }

}
00110 
/**
 * Static entry points for running HTML Tidy over text, using either the PHP
 * tidy extension or an external tidy binary depending on $wgTidyInternal.
 */
class MWTidy {
    /**
     * Clean up an HTML fragment with Tidy.
     *
     * The fragment is wrapped by MWTidyWrapper before Tidy runs and
     * post-processed by the same wrapper afterwards. If Tidy could not run or
     * produced no usable output, the original text is returned with an HTML
     * comment appended describing the failure.
     *
     * @param string $text HTML fragment to clean
     * @return string Corrected HTML, or the original text plus a warning comment
     */
    public static function tidy( $text ) {
        global $wgTidyInternal;

        $wrapper = new MWTidyWrapper;
        $wrappedtext = $wrapper->getWrapped( $text );

        $retVal = null;
        if ( $wgTidyInternal ) {
            $correctedtext = self::execInternalTidy( $wrappedtext, false, $retVal );
        } else {
            $correctedtext = self::execExternalTidy( $wrappedtext, false, $retVal );
        }

        if ( $retVal < 0 ) {
            // Negative status: tidy itself could not be started or loaded.
            wfDebug( "Possible tidy configuration error!\n" );
            return $text . "\n<!-- Tidy was unable to run -->\n";
        } elseif ( is_null( $correctedtext ) ) {
            // Tidy ran but yielded no corrected text.
            wfDebug( "Tidy error detected!\n" );
            return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
        }

        $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens

        return $correctedtext;
    }

    /**
     * Check a complete HTML document with Tidy and collect its error report.
     *
     * @param string $text HTML to check (passed to Tidy as-is, without wrapping)
     * @param string &$errorStr Receives Tidy's error output
     * @return bool True when Tidy's status is 0, or when the status is
     *   negative and no error output was produced; false otherwise
     */
    public static function checkErrors( $text, &$errorStr = null ) {
        global $wgTidyInternal;

        $retval = 0;
        if ( $wgTidyInternal ) {
            $errorStr = self::execInternalTidy( $text, true, $retval );
        } else {
            $errorStr = self::execExternalTidy( $text, true, $retval );
        }

        return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
    }

    /**
     * Run the external tidy binary ($wgTidyBin) on the given text.
     *
     * The command line is assembled from $wgTidyBin, $wgTidyConf and
     * $wgTidyOpts exactly as configured; these values are not shell-escaped
     * here.
     *
     * @param string $text Input sent to tidy's stdin
     * @param bool $stderr True to capture tidy's stderr (error report) and
     *   discard stdout; false to capture stdout (corrected text) and discard stderr
     * @param int &$retval Receives proc_close()'s result, or -1 if the
     *   process could not be started
     * @return string|null Captured output. Null when $stderr is false, the
     *   input was non-empty and nothing was captured.
     */
    private static function execExternalTidy( $text, $stderr = false, &$retval = null ) {
        global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
        wfProfileIn( __METHOD__ );

        $cleansource = '';
        $opts = ' -utf8';

        // Route the stream we are not interested in to the null device.
        if ( $stderr ) {
            $descriptorspec = array(
                0 => array( 'pipe', 'r' ),
                1 => array( 'file', wfGetNull(), 'a' ),
                2 => array( 'pipe', 'w' )
            );
        } else {
            $descriptorspec = array(
                0 => array( 'pipe', 'r' ),
                1 => array( 'pipe', 'w' ),
                2 => array( 'file', wfGetNull(), 'a' )
            );
        }

        $readpipe = $stderr ? 2 : 1;
        $pipes = array();

        $process = proc_open(
            "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes );

        //NOTE: At least on linux, the process will be created even if tidy is not installed.
        //      This means that missing tidy will be treated as a validation failure.

        if ( is_resource( $process ) ) {
            // Theoretically, this style of communication could cause a deadlock
            // here. If the stdout buffer fills up, then writes to stdin could
            // block. This doesn't appear to happen with tidy, because tidy only
            // writes to stdout after it's finished reading from stdin. Search
            // for tidyParseStdin and tidySaveStdout in console/tidy.c
            fwrite( $pipes[0], $text );
            fclose( $pipes[0] );

            // Read everything up to EOF in one call. Unlike a manual
            // feof()/fgets() loop this cannot spin if a read fails.
            $output = stream_get_contents( $pipes[$readpipe] );
            if ( $output !== false ) {
                $cleansource = $output;
            }
            fclose( $pipes[$readpipe] );
            $retval = proc_close( $process );
        } else {
            wfWarn( "Unable to start external tidy process" );
            $retval = -1;
        }

        if ( !$stderr && $cleansource == '' && $text != '' ) {
            // Some kind of error happened, so we couldn't get the corrected text.
            // Just give up; we'll use the source text and append a warning.
            $cleansource = null;
        }

        wfProfileOut( __METHOD__ );
        return $cleansource;
    }

    /**
     * Run Tidy through the PHP tidy extension.
     *
     * @param string $text Input to parse
     * @param bool $stderr True to return the error buffer after parsing only;
     *   false to run cleanRepair() and return the corrected text
     * @param int &$retval Receives the tidy status, or -1 if the tidy
     *   class is not available
     * @return string|null Error buffer or corrected text. Null when the tidy
     *   class is missing, or when $stderr is false and the status is 2.
     */
    private static function execInternalTidy( $text, $stderr = false, &$retval = null ) {
        global $wgTidyConf, $wgDebugTidy;
        wfProfileIn( __METHOD__ );

        if ( !class_exists( 'tidy' ) ) {
            wfWarn( "Unable to load internal tidy class." );
            $retval = -1;

            wfProfileOut( __METHOD__ );
            return null;
        }

        $tidy = new tidy;
        $tidy->parseString( $text, $wgTidyConf, 'utf8' );

        if ( $stderr ) {
            // Validation only: report the status and messages from parsing.
            $retval = $tidy->getStatus();

            wfProfileOut( __METHOD__ );
            return $tidy->errorBuffer;
        }

        $tidy->cleanRepair();
        $retval = $tidy->getStatus();
        if ( $retval == 2 ) {
            // 2 is magic number for fatal error
            // http://www.php.net/manual/en/function.tidy-get-status.php
            $cleansource = null;
        } else {
            $cleansource = tidy_get_output( $tidy );
            if ( $wgDebugTidy && $retval > 0 ) {
                // Append tidy's report as an HTML comment; escape any "-->"
                // so the report cannot terminate the comment early.
                $cleansource .= "<!--\nTidy reports:\n" .
                    str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
                    "\n-->";
            }
        }

        wfProfileOut( __METHOD__ );
        return $cleansource;
    }
}
00291 }