MediaWiki REL1_22
SearchUpdate.php
<?php

/**
 * Deferred update that pushes page changes into the configured search backends.
 */
class SearchUpdate implements DeferrableUpdate {
    /** @var int Page id being updated */
    private $id = 0;

    /** @var Title Title of the page being updated */
    private $title;

    /** @var Content|false Page content, or false to update the title only */
    private $content;

    /**
     * @param int $id Page id to update
     * @param Title|string $title Title of the page to update
     * @param Content|string|bool $c Page content, or false to update the title only
     */
    public function __construct( $id, $title, $c = false ) {
        if ( is_string( $title ) ) {
            $nt = Title::newFromText( $title );
        } else {
            $nt = $title;
        }

        if ( $nt ) {
            $this->id = $id;
            // is_string() check is back-compat for ApprovedRevs
            if ( is_string( $c ) ) {
                $this->content = new TextContent( $c );
            } else {
                $this->content = $c ?: false;
            }
            $this->title = $nt;
        } else {
            wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
        }
    }
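
    // Typical usage (a sketch based on how deferred updates work, not something
    // this file guarantees): the update is queued after an edit and executed at
    // the end of the request, e.g.
    //
    //     $update = new SearchUpdate( $page->getId(), $page->getTitle(), $content );
    //     DeferredUpdates::addUpdate( $update );
    //
    // Passing $c = false (the default) makes doUpdate() refresh only the title
    // entry for the page.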

    /**
     * Perform the deferred search index update for this page.
     */
    public function doUpdate() {
        global $wgDisableSearchUpdate;

        if ( $wgDisableSearchUpdate || !$this->id ) {
            return;
        }

        wfProfileIn( __METHOD__ );

        $page = WikiPage::newFromId( $this->id, WikiPage::READ_LATEST );
        $indexTitle = Title::indexTitle( $this->title->getNamespace(), $this->title->getText() );

        foreach ( SearchEngine::getSearchTypes() as $type ) {
            $search = SearchEngine::create( $type );
            if ( !$search->supports( 'search-update' ) ) {
                continue;
            }

            $normalTitle = $search->normalizeText( $indexTitle );

            if ( $page === null ) {
                // Page no longer exists: remove it from the index
                $search->delete( $this->id, $normalTitle );
                continue;
            } elseif ( $this->content === false ) {
                // No content supplied: refresh the title entry only
                $search->updateTitle( $this->id, $normalTitle );
                continue;
            }

            $text = $search->getTextFromContent( $this->title, $this->content );
            if ( !$search->textAlreadyUpdatedForIndex() ) {
                $text = self::updateText( $text );
            }

            # Perform the actual update
            $search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
        }

        wfProfileOut( __METHOD__ );
    }
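
    // Side note (an assumption about common practice, not enforced by this
    // file): during bulk imports wikis often set
    //
    //     $wgDisableSearchUpdate = true;
    //
    // in LocalSettings.php so that doUpdate() returns immediately, then rebuild
    // the search index afterwards with a maintenance script.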

    /**
     * Clean up wikitext/HTML into a plain, lower-cased string of searchable terms.
     *
     * @param string $text
     * @return string
     */
    public static function updateText( $text ) {
        global $wgContLang;

        # Language-specific strip/conversion
        $text = $wgContLang->normalizeForSearch( $text );
        $lc = SearchEngine::legalSearchChars() . '&#;';

        wfProfileIn( __METHOD__ . '-regexps' );
        $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
            ' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
        $text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
            "\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings
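        # (For instance, the words of a "== History ==" heading line appear
        # three times in the cleaned text, so heading terms carry extra weight
        # in the index.)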

        # Strip external URLs
        $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
        $protos = "http|https|ftp|mailto|news|gopher";
        $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
        $text = preg_replace( $pat, "\\1 \\3", $text );

        $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
        $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
        $text = preg_replace( $p1, "\\1 ", $text );
        $text = preg_replace( $p2, "\\1 \\3 ", $text );
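        # (Illustrative: in "see [http://example.com the site]" the bracketed
        # URL is dropped by $p2, but its label text "the site" is kept for
        # indexing.)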

        # Internal image links
        $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
        $text = preg_replace( $pat2, " \\1 \\3", $text );

        $text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
            "\\1\\2 \\2\\3", $text ); # Handle [[game]]s

        # Strip all remaining non-search characters
        $text = preg_replace( "/[^{$lc}]+/", " ", $text );

        # Handle 's, s'
        #
        #   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
        #   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
        #
        # These tail-anchored regexps are insanely slow. The worst case comes
        # when Japanese or Chinese text (i.e., no word spacing) is written on
        # a wiki configured for Western UTF-8 mode. The Unicode characters are
        # expanded to hex codes and the "words" are very long paragraph-length
        # monstrosities. On a large page the above regexps may take over 20
        # seconds *each* on a 1GHz-level processor.
        #
        # The following are reversed versions, which are consistently fast
        # (about 3 milliseconds on a 1GHz-level processor).
        #
        $text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
        $text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );
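        # (Illustrative: after the reversed passes, "mike's" contributes both
        # "mike" and "mike's" to the indexed text, matching what the slow
        # forward regexps would have produced.)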

        # Strip wiki '' and '''
        $text = preg_replace( "/''[']*/", " ", $text );
        wfProfileOut( __METHOD__ . '-regexps' );
        return $text;
    }
}
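
// ---------------------------------------------------------------------------
// Illustrative sketch only (not part of SearchUpdate.php): a minimal custom
// backend that doUpdate() above could drive. The method names mirror the
// calls made in doUpdate(); a real backend would override more of the
// SearchEngine API and actually talk to an index, which is glossed over here.
// ---------------------------------------------------------------------------
class ExampleSearchBackend extends SearchEngine {
    public function supports( $feature ) {
        // Advertise index maintenance so doUpdate() does not skip this backend
        return $feature === 'search-update' || parent::supports( $feature );
    }

    public function update( $id, $title, $text ) {
        // Store the normalized title and cleaned-up text for page $id
    }

    public function updateTitle( $id, $title ) {
        // Refresh only the title entry for page $id (no content was supplied)
    }

    public function delete( $id, $title ) {
        // Remove page $id from the index (the page no longer exists)
    }
}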