MediaWiki
REL1_19
|
<?php
/**
 * Search engine back end that queries the `searchindex` table through
 * MySQL's MATCH ... AGAINST boolean full-text search.
 */
class SearchMySQL extends SearchEngine {
	/**
	 * When true, a term written without an explicit boolean operator is
	 * given a leading '+', i.e. it becomes mandatory.
	 */
	public $strictMatching = true;

	/**
	 * Cached value of the server's ft_min_word_len setting; stays null
	 * until minSearchLength() has looked it up.
	 */
	public static $mMinSearchLength;

	/**
	 * Creates an instance of this class.
	 *
	 * @param $db Database handle; handed unchanged to the parent constructor.
	 */
	public function __construct( $db ) {
		parent::__construct( $db );
	}

	/**
	 * Parses the user's query and turns it into an SQL fragment for a
	 * boolean-mode full-text match. As a side effect $this->searchTerms is
	 * reset and refilled with one highlighting regex fragment per term.
	 *
	 * @param $filteredText String: the query text to parse
	 * @param $fulltext Boolean: true to match page text, false for titles
	 * @return String: " MATCH(field) AGAINST('...' IN BOOLEAN MODE) "
	 */
	public function parseQuery( $filteredText, $fulltext ) {
		global $wgContLang;
		$lc = SearchEngine::legalSearchChars(); // Minus format chars
		$searchon = '';
		$this->searchTerms = array();

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = array();
		if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
			$filteredText, $m, PREG_SET_ORDER )
		) {
			foreach ( $m as $bits ) {
				// When the quoted-phrase alternative matches, groups 3 and 4
				// never take part and are left out of $bits. Pad the array
				// explicitly instead of hiding the notices behind '@'.
				list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) =
					array_pad( $bits, 5, '' );

				$isQuoted = ( $nonQuoted === '' );
				if ( $isQuoted ) {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				} else {
					$term = $nonQuoted;
					$quote = '';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && ( $modifier == '' ) ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = array( $term );
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map(
					array( $wgContLang, 'normalizeForSearch' ),
					$variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				// Several surviving variants are grouped in parentheses so that
				// the modifier applies to the group as a whole.
				$grouped = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $grouped ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					if ( !$isQuoted && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $grouped ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
		}

		$searchon = $this->db->strencode( $searchon );
		$field = $this->getIndexField( $fulltext );
		return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) ";
	}

	/**
	 * Builds the regex fragment used to highlight one search term.
	 *
	 * @param $string String: the raw term; it is escaped with preg_quote()
	 * @param $wildcard String: non-empty when the term ended in '*'
	 * @return String: regex fragment, without delimiters
	 */
	public function regexTerm( $string, $wildcard ) {
		global $wgContLang;

		$regex = preg_quote( $string, '/' );
		if ( !$wgContLang->hasWordBreaks() ) {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for latin chars.
			return $regex;
		}

		if ( $wildcard ) {
			// Don't cut off the final bit!
			return '\b' . $regex;
		}
		return '\b' . $regex . '\b';
	}

	/**
	 * Returns the parent's legal search characters plus the double quote and
	 * asterisk, which this back end uses for phrases and wildcards.
	 *
	 * @return String
	 */
	public static function legalSearchChars() {
		return "\"*" . parent::legalSearchChars();
	}

	/**
	 * Performs a full-text search against page text.
	 *
	 * @param $term String: raw search term
	 * @return MySQLSearchResultSet, or null when $term is blank
	 */
	public function searchText( $term ) {
		return $this->searchInternal( $term, true );
	}

	/**
	 * Performs a search against page titles only.
	 *
	 * @param $term String: raw search term
	 * @return MySQLSearchResultSet, or null when $term is blank
	 */
	public function searchTitle( $term ) {
		return $this->searchInternal( $term, false );
	}

	/**
	 * Shared implementation of searchText() and searchTitle(): runs the
	 * result query and, when $wgCountTotalSearchHits is set, a second
	 * COUNT(*) query for the total number of hits.
	 *
	 * @param $term String: raw search term
	 * @param $fulltext Boolean: true to search text, false to search titles
	 * @return MySQLSearchResultSet, or null when $term is blank
	 */
	protected function searchInternal( $term, $fulltext ) {
		global $wgCountTotalSearchHits;

		// This seems out of place, why is this called with empty term?
		if ( trim( $term ) === '' ) {
			return null;
		}

		$filteredTerm = $this->filter( $term );
		$query = $this->getQuery( $filteredTerm, $fulltext );
		$resultSet = $this->db->select(
			$query['tables'], $query['fields'], $query['conds'],
			__METHOD__, $query['options'], $query['joins']
		);

		$total = null;
		if ( $wgCountTotalSearchHits ) {
			$query = $this->getCountQuery( $filteredTerm, $fulltext );
			$totalResult = $this->db->select(
				$query['tables'], $query['fields'], $query['conds'],
				__METHOD__, $query['options'], $query['joins']
			);

			$row = $totalResult->fetchObject();
			if ( $row ) {
				$total = intval( $row->c );
			}
			$totalResult->free();
		}

		return new MySQLSearchResultSet( $resultSet, $this->searchTerms, $total );
	}

	/**
	 * Reports whether this back end implements an optional search feature.
	 *
	 * @param $feature String: feature name
	 * @return Boolean
	 */
	public function supports( $feature ) {
		switch ( $feature ) {
			case 'list-redirects':
			case 'title-suffix-filter':
				return true;
			default:
				return false;
		}
	}

	/**
	 * Adds query conditions for the entries of $this->features.
	 *
	 * @param $query Array: query description, modified in place
	 */
	protected function queryFeatures( &$query ) {
		foreach ( $this->features as $feature => $value ) {
			if ( $feature === 'list-redirects' && !$value ) {
				// Redirect listing switched off: restrict to non-redirect pages.
				$query['conds']['page_is_redirect'] = 0;
			} elseif ( $feature === 'title-suffix-filter' && $value ) {
				// Builds a "page_title LIKE '%<value>'" style condition.
				$query['conds'][] = 'page_title' . $this->db->buildLike( $this->db->anyString(), $value );
			}
		}
	}

	/**
	 * Adds the namespace restriction to the query. Nothing is added when
	 * $this->namespaces is not an array. An empty array is replaced by
	 * array( '0' ); note that this writes back to $this->namespaces.
	 *
	 * @param $query Array: query description, modified in place
	 */
	public function queryNamespaces( &$query ) {
		if ( is_array( $this->namespaces ) ) {
			if ( count( $this->namespaces ) === 0 ) {
				$this->namespaces[] = '0';
			}
			$query['conds']['page_namespace'] = $this->namespaces;
		}
	}

	/**
	 * Adds the LIMIT and OFFSET options from $this->limit and $this->offset.
	 *
	 * @param $query Array: query description, modified in place
	 */
	protected function limitResult( &$query ) {
		$query['options']['LIMIT'] = $this->limit;
		$query['options']['OFFSET'] = $this->offset;
	}

	/**
	 * Assembles the complete result query for a search.
	 *
	 * @param $filteredTerm String: search term after filtering
	 * @param $fulltext Boolean: true to search text, false to search titles
	 * @return Array with the keys 'tables', 'fields', 'conds', 'options'
	 *  and 'joins'
	 */
	public function getQuery( $filteredTerm, $fulltext ) {
		$query = array(
			'tables' => array(),
			'fields' => array(),
			'conds' => array(),
			'options' => array(),
			'joins' => array(),
		);

		$this->queryMain( $query, $filteredTerm, $fulltext );
		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );
		$this->limitResult( $query );

		return $query;
	}

	/**
	 * Picks the searchindex column that is matched against.
	 *
	 * @param $fulltext Boolean
	 * @return String: 'si_text' when $fulltext is true, otherwise 'si_title'
	 */
	public function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Adds the tables, fields and conditions every result query needs:
	 * page joined to searchindex plus the full-text match condition.
	 *
	 * @param $query Array: query description, modified in place
	 * @param $filteredTerm String: search term after filtering
	 * @param $fulltext Boolean: true to search text, false to search titles
	 */
	public function queryMain( &$query, $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$query['tables'][] = 'page';
		$query['tables'][] = 'searchindex';
		$query['fields'][] = 'page_id';
		$query['fields'][] = 'page_namespace';
		$query['fields'][] = 'page_title';
		$query['conds'][] = 'page_id=si_page';
		$query['conds'][] = $match;
	}

	/**
	 * Assembles the COUNT(*) query used for the total number of hits. It has
	 * the same conditions as the result query but no LIMIT/OFFSET.
	 *
	 * @param $filteredTerm String: search term after filtering
	 * @param $fulltext Boolean: true to search text, false to search titles
	 * @return Array in the same format as getQuery()
	 */
	public function getCountQuery( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );

		$query = array(
			'tables' => array( 'page', 'searchindex' ),
			'fields' => array( 'COUNT(*) as c' ),
			'conds' => array( 'page_id=si_page', $match ),
			'options' => array(),
			'joins' => array(),
		);

		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );

		return $query;
	}

	/**
	 * Creates or replaces the search index row of a page.
	 *
	 * @param $id Integer: page id, stored in si_page
	 * @param $title String: stored normalized in si_title
	 * @param $text String: stored normalized in si_text
	 */
	public function update( $id, $title, $text ) {
		$dbw = wfGetDB( DB_MASTER );
		$dbw->replace( 'searchindex',
			array( 'si_page' ),
			array(
				'si_page' => $id,
				'si_title' => $this->normalizeText( $title ),
				'si_text' => $this->normalizeText( $text )
			), __METHOD__ );
	}

	/**
	 * Updates only the title column of an existing search index row.
	 *
	 * @param $id Integer: page id
	 * @param $title String: new title, stored normalized
	 */
	public function updateTitle( $id, $title ) {
		$dbw = wfGetDB( DB_MASTER );

		$dbw->update( 'searchindex',
			array( 'si_title' => $this->normalizeText( $title ) ),
			array( 'si_page' => $id ),
			__METHOD__,
			array( $dbw->lowPriorityOption() ) );
	}

	/**
	 * Converts text to the form stored in, and matched against, the search
	 * index. Applied both when indexing (update/updateTitle) and when
	 * building queries (parseQuery), so both sides agree.
	 *
	 * @param $string String
	 * @return String
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		wfProfileIn( __METHOD__ );

		$out = parent::normalizeText( $string );

		// MySQL fulltext index doesn't grok utf-8, so we
		// need to fold cases and convert to hex
		$out = preg_replace_callback(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			array( $this, 'stripForSearchCallback' ),
			$wgContLang->lc( $out ) );

		// And to add insult to injury, the default indexing
		// ignores short words... Pad them so we can pass them
		// through without reconfiguring the server...
		$minLength = $this->minSearchLength();
		if ( $minLength > 1 ) {
			$n = $minLength - 1;
			$out = preg_replace(
				"/\b(\w{1,$n})\b/",
				"$1u800",
				$out );
		}

		// Periods within things like hostnames and IP addresses
		// are also important -- we want a search for "example.com"
		// or "192.168.1.1" to work sanely.
		//
		// MySQL's search seems to ignore them, so you'd match on
		// "example.wikipedia.com" and "192.168.83.1" as well.
		$out = preg_replace(
			"/(\w)\.(\w|\*)/u",
			"$1u82e$2",
			$out );

		wfProfileOut( __METHOD__ );

		return $out;
	}

	/**
	 * preg_replace_callback() handler for normalizeText(): replaces one
	 * matched byte sequence by 'u8' followed by its hexadecimal form.
	 *
	 * @param $matches Array: match data; index 1 holds the bytes to encode
	 * @return String
	 */
	protected function stripForSearchCallback( $matches ) {
		return 'u8' . bin2hex( $matches[1] );
	}

	/**
	 * Returns the server's ft_min_word_len setting. The value is queried
	 * once and then cached in self::$mMinSearchLength.
	 *
	 * @return Integer: the setting, or 0 when the server did not report it
	 */
	protected function minSearchLength() {
		if ( is_null( self::$mMinSearchLength ) ) {
			$sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'";

			$dbr = wfGetDB( DB_SLAVE );
			$result = $dbr->query( $sql );
			$row = $result->fetchObject();
			$result->free();

			if ( $row && $row->Variable_name == 'ft_min_word_len' ) {
				self::$mMinSearchLength = intval( $row->Value );
			} else {
				self::$mMinSearchLength = 0;
			}
		}
		return self::$mMinSearchLength;
	}
}

/**
 * Result set that additionally carries the total hit count computed by
 * SearchMySQL::searchInternal().
 */
class MySQLSearchResultSet extends SqlSearchResultSet {
	/**
	 * @param $resultSet Database result; passed on to the parent constructor
	 * @param $terms Array: highlighting terms; passed on to the parent constructor
	 * @param $totalHits Integer|null: total number of hits, null if not counted
	 */
	public function __construct( $resultSet, $terms, $totalHits = null ) {
		parent::__construct( $resultSet, $terms );
		$this->mTotalHits = $totalHits;
	}

	/**
	 * @return Integer|null: the total passed to the constructor
	 */
	public function getTotalHits() {
		return $this->mTotalHits;
	}
}