MediaWiki  REL1_24
SearchUpdate.php
Go to the documentation of this file.
00001 <?php
/**
 * Deferrable update that refreshes the search index entry of a single page.
 *
 * For every configured search type that supports 'search-update', the page's
 * index entry is either deleted (page can no longer be loaded), has only its
 * title refreshed (no content supplied), or is fully rewritten.
 */
class SearchUpdate implements DeferrableUpdate {
    /**
     * @var int Page ID; stays 0 when the constructor was given an invalid
     *   title, which turns doUpdate() into a no-op
     */
    private $id = 0;

    /**
     * @var Title Title of the page being indexed (unset if the title passed
     *   to the constructor was invalid)
     */
    private $title;

    /** @var Content|bool Content to index, or false to refresh the title only */
    private $content;

    /**
     * @param int $id Page ID of the page to update
     * @param Title|string $title Title of the page; a string is resolved
     *   through Title::newFromText(), anything else is used as-is
     *   (presumably a Title object -- confirm against callers)
     * @param Content|string|bool $c Content to index. A string is wrapped in
     *   a TextContent; any falsy value is stored as false.
     */
    public function __construct( $id, $title, $c = false ) {
        if ( is_string( $title ) ) {
            $nt = Title::newFromText( $title );
        } else {
            $nt = $title;
        }

        if ( $nt ) {
            $this->id = $id;
            // is_string() check is back-compat for ApprovedRevs
            if ( is_string( $c ) ) {
                $this->content = new TextContent( $c );
            } else {
                $this->content = $c ?: false;
            }
            $this->title = $nt;
        } else {
            // $this->id is left at 0, so doUpdate() will return immediately.
            wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
        }
    }

    /**
     * Push the update to every search backend that supports 'search-update'.
     *
     * Does nothing when $wgDisableSearchUpdate is set or when no page ID was
     * stored by the constructor.
     */
    public function doUpdate() {
        global $wgDisableSearchUpdate;

        if ( $wgDisableSearchUpdate || !$this->id ) {
            return;
        }

        wfProfileIn( __METHOD__ );

        // null here means the page could not be loaded by ID; its index
        // entry is removed below.
        $page = WikiPage::newFromId( $this->id, WikiPage::READ_LATEST );

        foreach ( SearchEngine::getSearchTypes() as $type ) {
            $search = SearchEngine::create( $type );
            if ( !$search->supports( 'search-update' ) ) {
                continue;
            }

            // Only build the index title once we know this engine will use it.
            $normalTitle = $search->normalizeText( $this->indexTitle( $search ) );

            if ( $page === null ) {
                $search->delete( $this->id, $normalTitle );
                continue;
            } elseif ( $this->content === false ) {
                $search->updateTitle( $this->id, $normalTitle );
                continue;
            }

            $text = $search->getTextFromContent( $this->title, $this->content );
            if ( !$search->textAlreadyUpdatedForIndex() ) {
                $text = self::updateText( $text );
            }

            # Perform the actual update
            $search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
        }

        wfProfileOut( __METHOD__ );
    }

    /**
     * Clean up page text for indexing.
     *
     * Applies the content language's search normalisation, lower-cases the
     * text, then strips markup (HTML tags, external URLs, image link syntax,
     * wiki quote marks) and every character that is not a legal search
     * character. Heading text is repeated so that it appears three times.
     *
     * @param string $text Text to clean up
     * @return string Cleaned-up text
     */
    public static function updateText( $text ) {
        global $wgContLang;

        # Language-specific strip/conversion
        $text = $wgContLang->normalizeForSearch( $text );
        $lc = SearchEngine::legalSearchChars() . '&#;';

        wfProfileIn( __METHOD__ . '-regexps' );
        $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
            ' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
        $text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
            "\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

        # Strip external URLs
        $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
        $protos = "http|https|ftp|mailto|news|gopher";
        $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
        $text = preg_replace( $pat, "\\1 \\3", $text );

        # Bracketed external links: drop the URL, keep the label if there is one
        $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
        $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
        $text = preg_replace( $p1, "\\1 ", $text );
        $text = preg_replace( $p2, "\\1 \\3 ", $text );

        # Internal image links
        $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
        $text = preg_replace( $pat2, " \\1 \\3", $text );

        $text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
            "\\1\\2 \\2\\3", $text ); # Handle [[game]]s

        # Strip all remaining non-search characters
        $text = preg_replace( "/[^{$lc}]+/", " ", $text );

        # Handle 's, s'
        #
        #   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
        #   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
        #
        # These tail-anchored regexps are insanely slow. The worst case comes
        # when Japanese or Chinese text (ie, no word spacing) is written on
        # a wiki configured for Western UTF-8 mode. The Unicode characters are
        # expanded to hex codes and the "words" are very long paragraph-length
        # monstrosities. On a large page the above regexps may take over 20
        # seconds *each* on a 1GHz-level processor.
        #
        # Following are reversed versions which are consistently fast
        # (about 3 milliseconds on 1GHz-level processor).
        #
        # The text is byte-reversed so the suffix becomes a prefix anchor,
        # rewritten, and reversed back.
        $text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
        $text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

        # Strip wiki '' and '''
        $text = preg_replace( "/''[']*/", " ", $text );
        wfProfileOut( __METHOD__ . '-regexps' );

        return $text;
    }

    /**
     * Build the form of this page's title that is handed to a search engine.
     *
     * The title text (without namespace) is normalised by the content
     * language, reduced to the engine's legal search characters, lower-cased
     * and whitespace-collapsed. For pages in NS_FILE a trailing
     * " png", " gif", " jpg", " jpeg" or " ogg" token is removed.
     *
     * @param SearchEngine $search Engine whose legal search characters apply
     * @return string Trimmed title text for indexing
     */
    private function indexTitle( SearchEngine $search ) {
        global $wgContLang;

        $ns = $this->title->getNamespace();
        $title = $this->title->getText();

        $lc = $search->legalSearchChars() . '&#;';
        $t = $wgContLang->normalizeForSearch( $title );
        $t = preg_replace( "/[^{$lc}]+/", ' ', $t );
        $t = $wgContLang->lc( $t );

        # Handle 's, s'
        $t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
        $t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

        $t = preg_replace( "/\\s+/", ' ', $t );

        if ( $ns == NS_FILE ) {
            # By this point the '.' before the extension may already have been
            # replaced by a space, hence the leading space in the pattern.
            $t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
        }
        return trim( $t );
    }
}