MediaWiki REL1_19: SearchUpdate.php
<?php
/**
 * Database search index updater.
 */
class SearchUpdate implements DeferrableUpdate {

	private $mId = 0, $mNamespace, $mTitle, $mText;
	private $mTitleWords, $mTextWords;

	function __construct( $id, $title, $text = false ) {
		$nt = Title::newFromText( $title );
		if ( $nt ) {
			$this->mId = $id;
			$this->mText = $text;

			$this->mNamespace = $nt->getNamespace();
			$this->mTitle = $nt->getText(); # Discard namespace

			$this->mTitleWords = $this->mTextWords = array();
		} else {
			wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
		}
	}

	function doUpdate() {
		global $wgContLang, $wgDisableSearchUpdate;

		if ( $wgDisableSearchUpdate || !$this->mId ) {
			return;
		}

		wfProfileIn( __METHOD__ );

		$search = SearchEngine::create();
		$lc = SearchEngine::legalSearchChars() . '&#;';

		if ( $this->mText === false ) {
			# No text given: refresh the title index entry only
			$search->updateTitle( $this->mId,
				$search->normalizeText( Title::indexTitle( $this->mNamespace, $this->mTitle ) ) );
			wfProfileOut( __METHOD__ );
			return;
		}

		# Language-specific strip/conversion
		$text = $wgContLang->normalizeForSearch( $this->mText );

		wfProfileIn( __METHOD__ . '-regexps' );
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		# Handle 's, s'
		#
		# $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		# $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		#
		# These tail-anchored regexps are insanely slow. The worst case comes
		# when Japanese or Chinese text (i.e., no word spacing) is written on
		# a wiki configured for Western UTF-8 mode. The Unicode characters are
		# expanded to hex codes and the "words" are very long paragraph-length
		# monstrosities. On a large page the above regexps may take over 20
		# seconds *each* on a 1GHz-level processor.
		#
		# The following are reversed versions which are consistently fast
		# (about 3 milliseconds on a 1GHz-level processor).
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );
		wfProfileOut( __METHOD__ . '-regexps' );

		wfRunHooks( 'SearchUpdate', array( $this->mId, $this->mNamespace, $this->mTitle, &$text ) );

		# Perform the actual update
		$search->update( $this->mId,
			$search->normalizeText( Title::indexTitle( $this->mNamespace, $this->mTitle ) ),
			$search->normalizeText( $text ) );

		wfProfileOut( __METHOD__ );
	}
}

/**
 * Placeholder class; kept for backwards compatibility.
 */
class SearchUpdateMyISAM extends SearchUpdate {
	# Inherits everything
}
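Callers construct a SearchUpdate after a change and let it run deferred rather than inline with the edit. Below is a hedged usage sketch only: $wikiPage and $newText are hypothetical stand-ins for the just-saved page, and it assumes the DeferredUpdates::addUpdate() queue that ships alongside the DeferrableUpdate interface in this release; core's actual edit-path call site is not reproduced here.

<?php
// A usage sketch, assuming $wikiPage is the just-saved WikiPage and
// $newText is its new wikitext (both hypothetical).
$title = $wikiPage->getTitle()->getPrefixedDBkey();

// Queue a full re-index of title and body text to run at request end:
DeferredUpdates::addUpdate( new SearchUpdate( $wikiPage->getId(), $title, $newText ) );

// Passing $text = false (the default) re-indexes the title only,
// e.g. after a page move:
$u = new SearchUpdate( $wikiPage->getId(), $title );
$u->doUpdate();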
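The heading rule gives section titles extra weight by repeating them three times in the indexed text. Here is the same replacement run in isolation on a simplified input; in doUpdate() it operates on lowercased, space-padded text.

<?php
// Isolated sketch of the heading-emphasis step.
$text = "\n== Page Title ==\nbody text\n";
$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
	"\\1\\2 \\2 \\2\\3", $text );
echo $text; # "\nPage Title Page Title Page Title\nbody text\n"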
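The [[game]]s rule makes a blended suffix searchable both as the link target and as the displayed word. A sketch with a simplified $lc character class (in doUpdate(), $lc is SearchEngine::legalSearchChars() with '&#;' appended):

<?php
// Simplified character class for illustration only.
$lc = "A-Za-z0-9";
$text = "play [[game]]s here";
// "[[game]]s" indexes both "game" and "games"; the leftover brackets are
// removed by the later non-search-character strip.
$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/", "\\1\\2 \\2\\3", $text );
echo $text; # "play [[game games here"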
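The reversed-regex possessive trick is worth seeing in isolation. Reversing the string turns a tail-anchored pattern into a head-anchored one, so the engine never has to scan through a paragraph-length "word" before a match can fail. A sketch with a simplified $lc:

<?php
// Isolated sketch of the possessive handling; $lc is simplified here.
$lc = "A-Za-z0-9'";
$text = " the dog's bone and the cats' toys ";

// "dog's" => "dog dog's" (match "s'god" at the head of the reversed string):
$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
// "cats'" => "cats":
$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

echo $text; # " the dog dog's bone and the cats toys "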