MediaWiki
REL1_22
|
<?php

/**
 * Deferrable update that refreshes the search index entries of a single page.
 *
 * For every search type that supports 'search-update', doUpdate() deletes the
 * entry, updates only its title, or re-indexes the full text, depending on
 * whether the page still exists and whether content was supplied.
 */
class SearchUpdate implements DeferrableUpdate {
	/**
	 * Page id to update. Remains 0 when the object was built with a title
	 * that could not be resolved, which turns doUpdate() into a no-op.
	 */
	private $id = 0;

	/**
	 * Resolved title of the page; only assigned when the title is valid.
	 */
	private $title;

	/**
	 * Content to index, or false when only the title should be updated.
	 */
	private $content;

	/**
	 * @param int $id Page id that is looked up and passed to the search engines
	 * @param Title|string $title Title object, or title text that is resolved
	 *   through Title::newFromText()
	 * @param mixed $c Content to index. A string is wrapped in a TextContent;
	 *   any other truthy value is stored as given; falsy values become false.
	 */
	public function __construct( $id, $title, $c = false ) {
		$nt = is_string( $title ) ? Title::newFromText( $title ) : $title;

		if ( !$nt ) {
			// Leave $this->id at 0 so that doUpdate() does nothing
			wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
			return;
		}

		$this->id = $id;
		// is_string() check is back-compat for ApprovedRevs
		$this->content = is_string( $c ) ? new TextContent( $c ) : ( $c ?: false );
		$this->title = $nt;
	}

	/**
	 * Push the update to every search type that supports 'search-update'.
	 *
	 * Does nothing when $wgDisableSearchUpdate is set or no valid page id
	 * was stored by the constructor.
	 */
	public function doUpdate() {
		global $wgDisableSearchUpdate;

		if ( $wgDisableSearchUpdate || !$this->id ) {
			return;
		}

		wfProfileIn( __METHOD__ );

		$page = WikiPage::newFromId( $this->id, WikiPage::READ_LATEST );
		$indexTitle = Title::indexTitle( $this->title->getNamespace(), $this->title->getText() );

		foreach ( SearchEngine::getSearchTypes() as $type ) {
			$engine = SearchEngine::create( $type );
			if ( !$engine->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $engine->normalizeText( $indexTitle );

			if ( $page === null ) {
				// No page found for this id: remove it from the index
				$engine->delete( $this->id, $normalTitle );
			} elseif ( $this->content === false ) {
				// No content supplied: only the title is refreshed
				$engine->updateTitle( $this->id, $normalTitle );
			} else {
				$text = $engine->getTextFromContent( $this->title, $this->content );
				if ( !$engine->textAlreadyUpdatedForIndex() ) {
					$text = self::updateText( $text );
				}

				# Perform the actual update
				$engine->update( $this->id, $normalTitle, $engine->normalizeText( $text ) );
			}
		}

		wfProfileOut( __METHOD__ );
	}

	/**
	 * Reduce wikitext to a lower-cased, markup-free string for indexing.
	 *
	 * The steps run in a fixed order; later patterns rely on what earlier
	 * ones have already removed.
	 *
	 * @param string $text Text to clean up
	 * @return string Cleaned text
	 */
	public static function updateText( $text ) {
		global $wgContLang;

		# Language-specific strip/conversion
		$text = $wgContLang->normalizeForSearch( $text );
		$legal = SearchEngine::legalSearchChars() . '&#;';

		wfProfileIn( __METHOD__ . '-regexps' );

		# Strip HTML markup (the text is padded with a space on each side and lower-cased first)
		$padded = $wgContLang->lc( " " . $text . " " );
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", ' ', $padded );

		# Emphasize headings by repeating the heading text three times
		$text = preg_replace(
			"/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3",
			$text
		);

		# Strip external URLs
		$urlChars = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$schemes = "http|https|ftp|mailto|news|gopher";

		# ... bare URLs
		$bareUrl = "/(^|[^\\[])({$schemes}):[{$urlChars}]+([^{$urlChars}]|$)/";
		$text = preg_replace( $bareUrl, "\\1 \\3", $text );

		# ... bracketed URLs, without and with a label (the label is kept)
		$bracketedUrl = "/([^\\[])\\[({$schemes}):[{$urlChars}]+]/";
		$labelledUrl = "/([^\\[])\\[({$schemes}):[{$urlChars}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $bracketedUrl, "\\1 ", $text );
		$text = preg_replace( $labelledUrl, "\\1 \\3 ", $text );

		# Internal image links
		$imageLink = "/\\[\\[image:([{$urlChars}]+)\\.(gif|png|jpg|jpeg)([^{$urlChars}])/i";
		$text = preg_replace( $imageLink, " \\1 \\3", $text );

		# Handle [[game]]s
		$text = preg_replace(
			"/([^{$legal}])([{$legal}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3",
			$text
		);

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$legal}]+/", " ", $text );

		# Handle 's, s'
		#
		# The straightforward forms would be:
		#
		#   preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		#   preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		#
		# but these tail-anchored regexps are insanely slow. The worst case
		# comes when Japanese or Chinese text (ie, no word spacing) is written
		# on a wiki configured for Western UTF-8 mode. The Unicode characters
		# are expanded to hex codes and the "words" are very long
		# paragraph-length monstrosities. On a large page those regexps may
		# take over 20 seconds *each* on a 1GHz-level processor.
		#
		# The versions below work on the reversed string instead and are
		# consistently fast (about 3 milliseconds on 1GHz-level processor).
		$reversed = strrev( $text );
		$reversed = preg_replace( "/ s'([{$legal}]+)/", " s'\\1 \\1", $reversed );
		$text = strrev( $reversed );

		$reversed = strrev( $text );
		$reversed = preg_replace( "/ 's([{$legal}]+)/", " s\\1", $reversed );
		$text = strrev( $reversed );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		wfProfileOut( __METHOD__ . '-regexps' );

		return $text;
	}
}