MediaWiki
REL1_20
|
<?php
/**
 * Queues and performs an update of the search index for a single page.
 *
 * Instances are created with a page ID, title and (optionally) the raw
 * wikitext; doUpdate() normalises the text into a searchable form and hands
 * it to the active SearchEngine backend.
 */
class SearchUpdate implements DeferrableUpdate {

	/** @var int Page ID to index; stays 0 when the title was invalid, which makes doUpdate() a no-op */
	private $mId = 0;
	/** @var int Namespace number extracted from the title */
	private $mNamespace;
	/** @var string Title text with the namespace prefix discarded */
	private $mTitle;
	/** @var string|bool Raw page text, or false to update only the title index */
	private $mText;
	/** @var array Initialised to an empty array in the constructor; not otherwise read here */
	private $mTitleWords;
	/**
	 * @var array Declared explicitly: the constructor assigns it, and relying on
	 * dynamic property creation is deprecated as of PHP 8.2.
	 */
	private $mTextWords;

	/**
	 * @param int $id Page ID
	 * @param string $title Full page title, including namespace prefix
	 * @param string|bool $text Page text, or false to reindex the title only
	 */
	function __construct( $id, $title, $text = false ) {
		$nt = Title::newFromText( $title );
		if ( $nt ) {
			$this->mId = $id;
			$this->mText = $text;

			$this->mNamespace = $nt->getNamespace();
			$this->mTitle = $nt->getText(); # Discard namespace

			$this->mTitleWords = $this->mTextWords = array();
		} else {
			// Leave mId at 0 so doUpdate() skips the broken entry.
			wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
		}
	}

	/**
	 * Strip wiki/HTML markup and external links from the stored text, then
	 * push the normalised title (and, when available, body text) into the
	 * search backend. Respects $wgDisableSearchUpdate.
	 */
	function doUpdate() {
		global $wgContLang, $wgDisableSearchUpdate;

		// Skip when updates are globally disabled or the title failed to parse.
		if ( $wgDisableSearchUpdate || !$this->mId ) {
			return;
		}

		wfProfileIn( __METHOD__ );

		$search = SearchEngine::create();
		$lc = SearchEngine::legalSearchChars() . '&#;';

		// Title-only update: no body text was supplied.
		if ( $this->mText === false ) {
			$search->updateTitle( $this->mId,
				$search->normalizeText( Title::indexTitle( $this->mNamespace, $this->mTitle ) ) );
			wfProfileOut( __METHOD__ );
			return;
		}

		# Language-specific strip/conversion
		$text = $wgContLang->normalizeForSearch( $this->mText );

		wfProfileIn( __METHOD__ . '-regexps' );
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		# Handle 's, s'
		#
		# $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		# $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		#
		# These tail-anchored regexps are insanely slow. The worst case comes
		# when Japanese or Chinese text (ie, no word spacing) is written on
		# a wiki configured for Western UTF-8 mode. The Unicode characters are
		# expanded to hex codes and the "words" are very long paragraph-length
		# monstrosities. On a large page the above regexps may take over 20
		# seconds *each* on a 1GHz-level processor.
		#
		# Following are reversed versions which are consistently fast
		# (about 3 milliseconds on 1GHz-level processor).
		#
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );
		wfProfileOut( __METHOD__ . '-regexps' );

		// Hook may rewrite $text (passed by reference) before the index write.
		wfRunHooks( 'SearchUpdate', array( $this->mId, $this->mNamespace, $this->mTitle, &$text ) );

		# Perform the actual update
		$search->update( $this->mId, $search->normalizeText( Title::indexTitle( $this->mNamespace, $this->mTitle ) ),
			$search->normalizeText( $text ) );

		wfProfileOut( __METHOD__ );
	}
}

/**
 * Placeholder class kept for backwards compatibility; behaves exactly
 * like its parent.
 */
class SearchUpdateMyISAM extends SearchUpdate {
	# Inherits everything
}