MediaWiki
REL1_22
|
<?php

/**
 * Search backend that queries the `searchindex` table joined to `page`,
 * using MySQL's MATCH() ... AGAINST( ... IN BOOLEAN MODE ) syntax.
 */
class SearchMySQL extends SearchEngine {
	/**
	 * When true, parseQuery() prepends '+' to every term that has no explicit
	 * boolean modifier.
	 */
	public $strictMatching = true;

	/**
	 * Cached value of the server's ft_min_word_len setting, shared by all
	 * instances; stays null until minSearchLength() has run once.
	 */
	public static $mMinSearchLength;

	/**
	 * Creates an instance of this class.
	 *
	 * @param $db Database handle, passed unchanged to the parent constructor
	 */
	public function __construct( $db ) {
		parent::__construct( $db );
	}

	/**
	 * Parses the user's query and turns it into an SQL fragment of the form
	 * " MATCH(field) AGAINST('...' IN BOOLEAN MODE) ".
	 *
	 * As a side effect, $this->searchTerms is reset and filled with one
	 * regular-expression fragment per parsed term (see regexTerm()).
	 *
	 * @param string $filteredText Query text to parse
	 * @param bool $fulltext Selects the index column via getIndexField()
	 * @return string SQL condition fragment
	 */
	public function parseQuery( $filteredText, $fulltext ) {
		global $wgContLang;

		// Deliberately the base class's set: the override in this class adds
		// '"' and '*', which must act as syntax here, not as term characters.
		$lc = SearchEngine::legalSearchChars(); // Minus format chars
		$searchon = '';
		$this->searchTerms = array();

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = array();
		if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
				$filteredText, $m, PREG_SET_ORDER ) ) {
			foreach ( $m as $bits ) {
				// A quoted phrase leaves groups 3 and 4 out of the match, so pad
				// the array rather than silencing the resulting notices with '@'.
				list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) =
					array_pad( $bits, 5, '' );

				// Group 3 is only non-empty for a bare (unquoted) term.
				$isPlainTerm = ( $nonQuoted != '' );
				if ( $isPlainTerm ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && ( $modifier == '' ) ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = array( $term );
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map(
					array( $wgContLang, 'normalizeForSearch' ),
					$variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				// Several variants are grouped in parentheses behind one modifier.
				$needsGrouping = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $needsGrouping ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					if ( $isPlainTerm && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $needsGrouping ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
		}

		$searchon = $this->db->strencode( $searchon );
		$field = $this->getIndexField( $fulltext );
		return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) ";
	}

	/**
	 * Builds the regular-expression fragment for one search term; parseQuery()
	 * stores these in $this->searchTerms.
	 *
	 * @param string $string Literal term; it is escaped with preg_quote()
	 * @param string $wildcard Non-empty when the term ended in '*'
	 * @return string Regex fragment without delimiters
	 */
	public function regexTerm( $string, $wildcard ) {
		global $wgContLang;

		$regex = preg_quote( $string, '/' );
		if ( $wgContLang->hasWordBreaks() ) {
			if ( $wildcard ) {
				// Don't cut off the final bit!
				$regex = "\b$regex";
			} else {
				$regex = "\b$regex\b";
			}
		} else {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for latin chars.
		}
		return $regex;
	}

	/**
	 * Returns the parent's legal search characters with '"' and '*' added in front.
	 *
	 * @return string
	 */
	public static function legalSearchChars() {
		return "\"*" . parent::legalSearchChars();
	}

	/**
	 * Runs a search against the text index column (si_text).
	 *
	 * @param string $term Raw search term
	 * @return MySQLSearchResultSet|null Null when the term is blank
	 */
	public function searchText( $term ) {
		return $this->searchInternal( $term, true );
	}

	/**
	 * Runs a search against the title index column (si_title).
	 *
	 * @param string $term Raw search term
	 * @return MySQLSearchResultSet|null Null when the term is blank
	 */
	public function searchTitle( $term ) {
		return $this->searchInternal( $term, false );
	}

	/**
	 * Shared implementation behind searchText() and searchTitle().
	 *
	 * Runs the result query and, when $wgCountTotalSearchHits is set, a second
	 * COUNT(*) query whose result becomes the total hit count.
	 *
	 * @param string $term Raw search term
	 * @param bool $fulltext True for the text column, false for the title column
	 * @return MySQLSearchResultSet|null Null when the trimmed term is empty
	 */
	protected function searchInternal( $term, $fulltext ) {
		global $wgCountTotalSearchHits;

		// This seems out of place, why is this called with empty term?
		if ( trim( $term ) === '' ) {
			return null;
		}

		$filteredTerm = $this->filter( $term );
		$query = $this->getQuery( $filteredTerm, $fulltext );
		$resultSet = $this->db->select(
			$query['tables'], $query['fields'], $query['conds'],
			__METHOD__, $query['options'], $query['joins']
		);

		$total = null;
		if ( $wgCountTotalSearchHits ) {
			$query = $this->getCountQuery( $filteredTerm, $fulltext );
			$totalResult = $this->db->select(
				$query['tables'], $query['fields'], $query['conds'],
				__METHOD__, $query['options'], $query['joins']
			);

			$row = $totalResult->fetchObject();
			if ( $row ) {
				$total = intval( $row->c );
			}
			$totalResult->free();
		}

		return new MySQLSearchResultSet( $resultSet, $this->searchTerms, $total );
	}

	/**
	 * Reports whether a named feature is supported. This class answers true for
	 * 'title-suffix-filter' and defers every other feature to the parent.
	 *
	 * @param string $feature
	 * @return bool
	 */
	public function supports( $feature ) {
		switch ( $feature ) {
			case 'title-suffix-filter':
				return true;
			default:
				return parent::supports( $feature );
		}
	}

	/**
	 * Adds conditions to the query for each entry in $this->features:
	 * 'list-redirects' set to a false value restricts to page_is_redirect = 0,
	 * and a non-empty 'title-suffix-filter' adds a LIKE condition matching
	 * page titles that end with the given value.
	 *
	 * @param array &$query Query array as built by getQuery()/getCountQuery()
	 */
	protected function queryFeatures( &$query ) {
		foreach ( $this->features as $feature => $value ) {
			if ( $feature === 'list-redirects' && !$value ) {
				$query['conds']['page_is_redirect'] = 0;
			} elseif ( $feature === 'title-suffix-filter' && $value ) {
				$query['conds'][] = 'page_title' .
					$this->db->buildLike( $this->db->anyString(), $value );
			}
		}
	}

	/**
	 * Adds a page_namespace condition when $this->namespaces is an array.
	 *
	 * Note that an empty array is replaced, on the object itself, by one
	 * containing '0' before it is used.
	 *
	 * @param array &$query Query array as built by getQuery()/getCountQuery()
	 */
	public function queryNamespaces( &$query ) {
		if ( is_array( $this->namespaces ) ) {
			if ( count( $this->namespaces ) === 0 ) {
				$this->namespaces[] = '0';
			}
			$query['conds']['page_namespace'] = $this->namespaces;
		}
	}

	/**
	 * Copies $this->limit and $this->offset into the query's LIMIT and OFFSET options.
	 *
	 * @param array &$query Query array as built by getQuery()
	 */
	protected function limitResult( &$query ) {
		$query['options']['LIMIT'] = $this->limit;
		$query['options']['OFFSET'] = $this->offset;
	}

	/**
	 * Assembles the complete result query: main match, feature conditions,
	 * namespace condition and limit/offset, in that order.
	 *
	 * @param string $filteredTerm Search term after filter()
	 * @param bool $fulltext True for the text column, false for the title column
	 * @return array Keys: 'tables', 'fields', 'conds', 'options', 'joins'
	 */
	public function getQuery( $filteredTerm, $fulltext ) {
		$query = array(
			'tables' => array(),
			'fields' => array(),
			'conds' => array(),
			'options' => array(),
			'joins' => array(),
		);

		$this->queryMain( $query, $filteredTerm, $fulltext );
		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );
		$this->limitResult( $query );

		return $query;
	}

	/**
	 * Picks the searchindex column to match against.
	 *
	 * @param bool $fulltext
	 * @return string 'si_text' when $fulltext is truthy, otherwise 'si_title'
	 */
	public function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Adds the page/searchindex tables, the selected page fields, the join
	 * condition and the MATCH condition from parseQuery() to the query.
	 *
	 * @param array &$query Query array to extend
	 * @param string $filteredTerm Search term after filter()
	 * @param bool $fulltext True for the text column, false for the title column
	 */
	public function queryMain( &$query, $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$query['tables'][] = 'page';
		$query['tables'][] = 'searchindex';
		$query['fields'][] = 'page_id';
		$query['fields'][] = 'page_namespace';
		$query['fields'][] = 'page_title';
		$query['conds'][] = 'page_id=si_page';
		$query['conds'][] = $match;
	}

	/**
	 * Assembles the COUNT(*) query used for the total hit count. It applies
	 * the same match, feature and namespace conditions as getQuery(), but no
	 * limit or offset.
	 *
	 * @param string $filteredTerm Search term after filter()
	 * @param bool $fulltext True for the text column, false for the title column
	 * @return array Keys: 'tables', 'fields', 'conds', 'options', 'joins'
	 */
	public function getCountQuery( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );

		$query = array(
			'tables' => array( 'page', 'searchindex' ),
			'fields' => array( 'COUNT(*) as c' ),
			'conds' => array( 'page_id=si_page', $match ),
			'options' => array(),
			'joins' => array(),
		);

		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );

		return $query;
	}

	/**
	 * Writes the searchindex row for a page via replace(), with 'si_page'
	 * passed as the unique key and both title and text run through
	 * normalizeText().
	 *
	 * @param int $id Page ID
	 * @param string $title
	 * @param string $text
	 */
	public function update( $id, $title, $text ) {
		$dbw = wfGetDB( DB_MASTER );
		$dbw->replace( 'searchindex',
			array( 'si_page' ),
			array(
				'si_page' => $id,
				'si_title' => $this->normalizeText( $title ),
				'si_text' => $this->normalizeText( $text )
			), __METHOD__ );
	}

	/**
	 * Updates only the normalized title of a page's searchindex row, passing
	 * the database's low-priority option to the update.
	 *
	 * @param int $id Page ID
	 * @param string $title
	 */
	public function updateTitle( $id, $title ) {
		$dbw = wfGetDB( DB_MASTER );

		$dbw->update( 'searchindex',
			array( 'si_title' => $this->normalizeText( $title ) ),
			array( 'si_page' => $id ),
			__METHOD__,
			array( $dbw->lowPriorityOption() ) );
	}

	/**
	 * Deletes a page's row from searchindex.
	 *
	 * @param int $id Page ID whose row is removed
	 * @param string $title Not used by this implementation
	 */
	public function delete( $id, $title ) {
		$dbw = wfGetDB( DB_MASTER );

		$dbw->delete( 'searchindex', array( 'si_page' => $id ), __METHOD__ );
	}

	/**
	 * Converts text to the form stored in, and matched against, the fulltext
	 * index. After the parent's normalization it lowercases the text and then:
	 *  - replaces each multi-byte sequence with 'u8' plus its hex encoding,
	 *  - appends 'u800' to words shorter than the server's minimum word length,
	 *  - replaces a period between word characters with 'u82e'.
	 *
	 * @param string $string
	 * @return string
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		wfProfileIn( __METHOD__ );

		$out = parent::normalizeText( $string );

		// MySQL fulltext index doesn't grok utf-8, so we
		// need to fold cases and convert to hex
		$out = preg_replace_callback(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			array( $this, 'stripForSearchCallback' ),
			$wgContLang->lc( $out ) );

		// And to add insult to injury, the default indexing
		// ignores short words... Pad them so we can pass them
		// through without reconfiguring the server...
		$minLength = $this->minSearchLength();
		if ( $minLength > 1 ) {
			$n = $minLength - 1;
			$out = preg_replace(
				"/\b(\w{1,$n})\b/",
				"$1u800",
				$out );
		}

		// Periods within things like hostnames and IP addresses
		// are also important -- we want a search for "example.com"
		// or "192.168.1.1" to work sanely.
		//
		// MySQL's search seems to ignore them, so you'd match on
		// "example.wikipedia.com" and "192.168.83.1" as well.
		$out = preg_replace(
			"/(\w)\.(\w|\*)/u",
			"$1u82e$2",
			$out );

		wfProfileOut( __METHOD__ );

		return $out;
	}

	/**
	 * preg_replace_callback() handler used by normalizeText(): encodes the
	 * matched byte sequence as 'u8' followed by its hexadecimal form.
	 *
	 * @param array $matches Match array; element 1 is the captured sequence
	 * @return string
	 */
	protected function stripForSearchCallback( $matches ) {
		return 'u8' . bin2hex( $matches[1] );
	}

	/**
	 * Returns the server's ft_min_word_len setting, querying it once and
	 * caching the answer in self::$mMinSearchLength. Yields 0 when the
	 * variable cannot be read.
	 *
	 * @return int
	 */
	protected function minSearchLength() {
		if ( is_null( self::$mMinSearchLength ) ) {
			$sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'";

			$dbr = wfGetDB( DB_SLAVE );
			$result = $dbr->query( $sql );
			$row = $result->fetchObject();
			$result->free();

			if ( $row && $row->Variable_name == 'ft_min_word_len' ) {
				self::$mMinSearchLength = intval( $row->Value );
			} else {
				self::$mMinSearchLength = 0;
			}
		}
		return self::$mMinSearchLength;
	}
}

/**
 * Result set returned by SearchMySQL; adds an optional total hit count to
 * SqlSearchResultSet.
 */
class MySQLSearchResultSet extends SqlSearchResultSet {
	/**
	 * @param $resultSet Database result, passed to the parent constructor
	 * @param array $terms Search term regex fragments, passed to the parent constructor
	 * @param int|null $totalHits Total number of hits, or null when not counted
	 */
	public function __construct( $resultSet, $terms, $totalHits = null ) {
		parent::__construct( $resultSet, $terms );
		$this->mTotalHits = $totalHits;
	}

	/**
	 * @return int|null The total passed to the constructor
	 */
	public function getTotalHits() {
		return $this->mTotalHits;
	}
}