MediaWiki REL1_23
SearchUpdate.php
<?php
/**
 * Database search index updater.
 */
class SearchUpdate implements DeferrableUpdate {
    /** @var int Page id being updated */
    private $id = 0;

    /** @var Title Title of the page being updated */
    private $title;

    /** @var Content|bool Content of the page, or false for a title-only update */
    private $content;

    /**
     * @param int $id Page id to update
     * @param Title|string $title Title of the page to update
     * @param Content|string|bool $c Content of the page to update. A string is
     *  wrapped in a TextContent object for backwards compatibility; false means
     *  only the title will be re-indexed.
     */
    public function __construct( $id, $title, $c = false ) {
        if ( is_string( $title ) ) {
            $nt = Title::newFromText( $title );
        } else {
            $nt = $title;
        }

        if ( $nt ) {
            $this->id = $id;
            // is_string() check is back-compat for ApprovedRevs
            if ( is_string( $c ) ) {
                $this->content = new TextContent( $c );
            } else {
                $this->content = $c ?: false;
            }
            $this->title = $nt;
        } else {
            wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
        }
    }

    /**
     * Perform the actual search index update.
     */
    public function doUpdate() {
        global $wgDisableSearchUpdate;

        if ( $wgDisableSearchUpdate || !$this->id ) {
            return;
        }

        wfProfileIn( __METHOD__ );

        $page = WikiPage::newFromId( $this->id, WikiPage::READ_LATEST );
        $indexTitle = Title::indexTitle( $this->title->getNamespace(), $this->title->getText() );

        foreach ( SearchEngine::getSearchTypes() as $type ) {
            $search = SearchEngine::create( $type );
            if ( !$search->supports( 'search-update' ) ) {
                continue;
            }

            $normalTitle = $search->normalizeText( $indexTitle );

            if ( $page === null ) {
                // The page no longer exists; drop it from the index
                $search->delete( $this->id, $normalTitle );
                continue;
            } elseif ( $this->content === false ) {
                // No content supplied: re-index the title only
                $search->updateTitle( $this->id, $normalTitle );
                continue;
            }

            $text = $search->getTextFromContent( $this->title, $this->content );
            if ( !$search->textAlreadyUpdatedForIndex() ) {
                $text = self::updateText( $text );
            }

            # Perform the actual update
            $search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
        }

        wfProfileOut( __METHOD__ );
    }
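
    /*
     * A minimal sketch (an assumption for illustration, not part of MediaWiki
     * core) of a backend the loop above could drive; only the hooks called from
     * doUpdate() are shown, with the same signatures used there:
     *
     *     class ExampleNullSearch extends SearchEngine {
     *         function supports( $feature ) {
     *             // advertise index updates so doUpdate() does not skip this backend
     *             return $feature === 'search-update';
     *         }
     *         function update( $id, $title, $text ) {
     *             wfDebugLog( 'search', "update #$id '$title': " . strlen( $text ) . " bytes" );
     *         }
     *         function updateTitle( $id, $title ) {
     *             wfDebugLog( 'search', "title-only update #$id '$title'" );
     *         }
     *         function delete( $id, $title ) {
     *             wfDebugLog( 'search', "delete #$id '$title'" );
     *         }
     *     }
     *
     * Such a class would only be reached here if SearchEngine::getSearchTypes()
     * returned its name, e.g. via the $wgSearchType configuration.
     */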

    /**
     * Clean text for database storage: strip markup, normalize for the content
     * language and emphasize headings.
     *
     * @param string $text Text to clean
     * @return string Cleaned text
     */
    public static function updateText( $text ) {
        global $wgContLang;

        # Language-specific strip/conversion
        $text = $wgContLang->normalizeForSearch( $text );
        $lc = SearchEngine::legalSearchChars() . '&#;';

        wfProfileIn( __METHOD__ . '-regexps' );
        $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
            ' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
        $text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
            "\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings
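        # e.g. a lowercased "== history ==" line comes out roughly as
        # "history history history", giving heading words triple weight in the index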

        # Strip external URLs
        $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
        $protos = "http|https|ftp|mailto|news|gopher";
        $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
        $text = preg_replace( $pat, "\\1 \\3", $text );

        $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
        $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
        $text = preg_replace( $p1, "\\1 ", $text );
        $text = preg_replace( $p2, "\\1 \\3 ", $text );

        # Internal image links
        $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
        $text = preg_replace( $pat2, " \\1 \\3", $text );

        $text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
            "\\1\\2 \\2\\3", $text ); # Handle [[game]]s

        # Strip all remaining non-search characters
        $text = preg_replace( "/[^{$lc}]+/", " ", $text );

        # Handle 's, s'
        #
        #   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
        #   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
        #
        # These tail-anchored regexps are insanely slow. The worst case comes
        # when Japanese or Chinese text (ie, no word spacing) is written on
        # a wiki configured for Western UTF-8 mode. The Unicode characters are
        # expanded to hex codes and the "words" are very long paragraph-length
        # monstrosities. On a large page the above regexps may take over 20
        # seconds *each* on a 1GHz-level processor.
        #
        # Following are reversed versions which are consistently fast
        # (about 3 milliseconds on 1GHz-level processor).
        #
        $text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
        $text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );
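        # Net effect, matching the forward versions above: "word's" also indexes
        # the bare "word", and a trailing "s'" simply loses its apostrophe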

        # Strip wiki '' and '''
        $text = preg_replace( "/''[']*/", " ", $text );
        wfProfileOut( __METHOD__ . '-regexps' );

        return $text;
    }
}
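
/*
 * Usage sketch: after an edit, core code typically queues one of these objects
 * through the deferred-update mechanism rather than calling doUpdate() directly,
 * along the lines of (the variable names here are assumptions):
 *
 *     DeferredUpdates::addUpdate(
 *         new SearchUpdate( $wikiPage->getId(), $wikiPage->getTitle(), $content )
 *     );
 *
 * The deferred-update runner then calls doUpdate() near the end of the request,
 * which fans the change out to every configured search backend.
 */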