MediaWiki
REL1_23
|
<?php
/**
 * Deferrable update that refreshes the search-index entry of a single page
 * in every configured search engine type that supports 'search-update'.
 */
class SearchUpdate implements DeferrableUpdate {
	/** @var int Page id to update; stays 0 when the supplied title was invalid */
	private $id = 0;

	/** @var Title Title of the page being indexed (unset when the title was invalid) */
	private $title;

	/** @var mixed|bool Content to index, or false when only the title is to be updated */
	private $content;

	/**
	 * @param int $id Page id to update
	 * @param Title|string $title Title object, or title text that is resolved
	 *   through Title::newFromText()
	 * @param mixed|string|bool $c Content to index. A plain string is wrapped in a
	 *   TextContent; any falsy value is stored as false. Other values are stored
	 *   as given (presumably a Content object -- confirm against callers).
	 */
	public function __construct( $id, $title, $c = false ) {
		$nt = is_string( $title ) ? Title::newFromText( $title ) : $title;

		if ( !$nt ) {
			// Nothing is stored, so $this->id stays 0 and doUpdate() is a no-op.
			wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
			return;
		}

		$this->id = $id;
		// Accepting a raw string is kept for back-compat with ApprovedRevs.
		$this->content = is_string( $c ) ? new TextContent( $c ) : ( $c ?: false );
		$this->title = $nt;
	}

	/**
	 * Push the update to each search engine type that supports 'search-update'.
	 *
	 * Does nothing when $wgDisableSearchUpdate is set or no page id was stored.
	 * Per engine: the entry is deleted when the page can no longer be loaded,
	 * only the title is updated when no content was supplied, and otherwise
	 * title and text are both updated.
	 */
	public function doUpdate() {
		global $wgDisableSearchUpdate;

		if ( $wgDisableSearchUpdate || !$this->id ) {
			return;
		}

		wfProfileIn( __METHOD__ );

		$page = WikiPage::newFromId( $this->id, WikiPage::READ_LATEST );
		$indexTitle = Title::indexTitle(
			$this->title->getNamespace(),
			$this->title->getText()
		);

		foreach ( SearchEngine::getSearchTypes() as $engineType ) {
			$engine = SearchEngine::create( $engineType );
			if ( !$engine->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $engine->normalizeText( $indexTitle );

			if ( $page === null ) {
				// The page could not be loaded: drop its index entry.
				$engine->delete( $this->id, $normalTitle );
			} elseif ( $this->content === false ) {
				// No content supplied: refresh the title only.
				$engine->updateTitle( $this->id, $normalTitle );
			} else {
				$text = $engine->getTextFromContent( $this->title, $this->content );
				if ( !$engine->textAlreadyUpdatedForIndex() ) {
					$text = self::updateText( $text );
				}

				// Perform the actual update
				$engine->update( $this->id, $normalTitle, $engine->normalizeText( $text ) );
			}
		}

		wfProfileOut( __METHOD__ );
	}

	/**
	 * Clean up text for indexing: lower-case it, strip HTML tags and URLs,
	 * emphasize headings, and drop characters that are not legal search
	 * characters.
	 *
	 * @param string $text Text to prepare
	 * @return string Prepared text
	 */
	public static function updateText( $text ) {
		global $wgContLang;

		// Language-specific strip/conversion
		$text = $wgContLang->normalizeForSearch( $text );
		$lc = SearchEngine::legalSearchChars() . '&#;';

		wfProfileIn( __METHOD__ . '-regexps' );

		// Strip HTML markup; the text is lower-cased and padded with a space
		// on either side first.
		$padded = $wgContLang->lc( " " . $text . " " );
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", ' ', $padded );

		// Emphasize headings by repeating their text three times
		$text = preg_replace(
			"/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3",
			$text
		);

		// Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$bareUrl = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $bareUrl, "\\1 \\3", $text );

		// Bracketed external links: first without, then with a label
		$bracketedUrl = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$text = preg_replace( $bracketedUrl, "\\1 ", $text );
		$labelledUrl = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $labelledUrl, "\\1 \\3 ", $text );

		// Internal image links
		$imageLink = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $imageLink, " \\1 \\3", $text );

		// Handle [[game]]s
		$text = preg_replace(
			"/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3",
			$text
		);

		// Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		// Handle 's, s'
		//
		//   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		//   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		//
		// These tail-anchored regexps are insanely slow. The worst case comes
		// when Japanese or Chinese text (ie, no word spacing) is written on
		// a wiki configured for Western UTF-8 mode. The Unicode characters are
		// expanded to hex codes and the "words" are very long paragraph-length
		// monstrosities. On a large page the above regexps may take over 20
		// seconds *each* on a 1GHz-level processor.
		//
		// Following are reversed versions which are consistently fast
		// (about 3 milliseconds on 1GHz-level processor): the text is reversed,
		// matched with a head-anchored pattern, then reversed back.
		$reversed = strrev( $text );
		$reversed = preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", $reversed );
		$text = strrev( $reversed );

		$reversed = strrev( $text );
		$reversed = preg_replace( "/ 's([{$lc}]+)/", " s\\1", $reversed );
		$text = strrev( $reversed );

		// Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		wfProfileOut( __METHOD__ . '-regexps' );

		return $text;
	}
}