MediaWiki REL1_24
<?php

/**
 * Deferred update that (re)builds the search index entry for a single page.
 *
 * Instances are queued by the page-save machinery; doUpdate() later pushes
 * the normalized title and text to every configured search backend.
 */
class SearchUpdate implements DeferrableUpdate {
	/** @var int Page id being updated (0 means "invalid, do nothing") */
	private $id = 0;

	/** @var Title Title of the page being updated */
	private $title;

	/** @var Content|false Page content, or false to update the title only */
	private $content;

	/**
	 * @param int $id Page id to update
	 * @param Title|string $title Title of the page to update (a string is
	 *   parsed with Title::newFromText())
	 * @param Content|string|bool $c Content of the page to update.
	 *   A string is wrapped in a TextContent (back-compat, see below);
	 *   false means "title-only update".
	 */
	public function __construct( $id, $title, $c = false ) {
		if ( is_string( $title ) ) {
			$nt = Title::newFromText( $title );
		} else {
			$nt = $title;
		}

		if ( $nt ) {
			$this->id = $id;
			// is_string() check is back-compat for ApprovedRevs
			if ( is_string( $c ) ) {
				$this->content = new TextContent( $c );
			} else {
				// Collapse any falsy content value to false ("title only")
				$this->content = $c ?: false;
			}
			$this->title = $nt;
		} else {
			wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
		}
	}

	/**
	 * Perform actual update for the entry: for each search engine type,
	 * delete the entry if the page is gone, update the title only if no
	 * content was supplied, or push the fully normalized text otherwise.
	 */
	public function doUpdate() {
		global $wgDisableSearchUpdate;

		if ( $wgDisableSearchUpdate || !$this->id ) {
			// Search updates are globally disabled, or the constructor
			// rejected the title; nothing to do.
			return;
		}

		wfProfileIn( __METHOD__ );

		// READ_LATEST: index what was just saved, not a lagged replica view
		$page = WikiPage::newFromId( $this->id, WikiPage::READ_LATEST );

		foreach ( SearchEngine::getSearchTypes() as $type ) {
			$search = SearchEngine::create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			// Only normalize the title once we know this engine wants it
			// (the original computed it before the supports() check, wasting
			// several regex passes per non-updating engine).
			$normalTitle = $search->normalizeText( $this->indexTitle( $search ) );

			if ( $page === null ) {
				// Page no longer exists: drop it from the index
				$search->delete( $this->id, $normalTitle );
				continue;
			} elseif ( $this->content === false ) {
				// Title-only update (e.g. page move)
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			$text = $search->getTextFromContent( $this->title, $this->content );
			if ( !$search->textAlreadyUpdatedForIndex() ) {
				$text = self::updateText( $text );
			}

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}

		wfProfileOut( __METHOD__ );
	}

	/**
	 * Clean text for the search index: strip markup, URLs and image links,
	 * emphasize headings, and normalize possessives, leaving only legal
	 * search characters.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function updateText( $text ) {
		global $wgContLang;

		# Language-specific strip/conversion
		$text = $wgContLang->normalizeForSearch( $text );
		$lc = SearchEngine::legalSearchChars() . '&#;';

		wfProfileIn( __METHOD__ . '-regexps' );
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		# Handle 's, s'
		#
		# $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		# $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		#
		# These tail-anchored regexps are insanely slow. The worst case comes
		# when Japanese or Chinese text (ie, no word spacing) is written on
		# a wiki configured for Western UTF-8 mode. The Unicode characters are
		# expanded to hex codes and the "words" are very long paragraph-length
		# monstrosities. On a large page the above regexps may take over 20
		# seconds *each* on a 1GHz-level processor.
		#
		# Following are reversed versions which are consistently fast
		# (about 3 milliseconds on 1GHz-level processor).
		#
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );
		wfProfileOut( __METHOD__ . '-regexps' );

		return $text;
	}

	/**
	 * Get a normalized string representation of this update's title,
	 * suitable for the search index: legal characters only, lowercased,
	 * possessives expanded, and (for files) the image extension stripped.
	 *
	 * @param SearchEngine $search
	 * @return string
	 */
	private function indexTitle( SearchEngine $search ) {
		global $wgContLang;

		$ns = $this->title->getNamespace();
		$title = $this->title->getText();

		$lc = $search->legalSearchChars() . '&#;';
		$t = $wgContLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $wgContLang->lc( $t );

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		$t = preg_replace( "/\\s+/", ' ', $t );

		if ( $ns == NS_FILE ) {
			$t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
		}
		return trim( $t );
	}
}