MediaWiki
REL1_23
|
<?php

/**
 * Search engine backend that queries the `searchindex` table through
 * MySQL's MATCH ... AGAINST ... IN BOOLEAN MODE fulltext syntax.
 */
class SearchMySQL extends SearchDatabase {
	/**
	 * When true, terms without an explicit boolean modifier are prefixed
	 * with '+' so that every term is required (see parseQuery()).
	 */
	public $strictMatching = true;

	/**
	 * Cached result of minSearchLength(); null until first looked up.
	 * Shared by all instances.
	 */
	public static $mMinSearchLength;

	/**
	 * Parse the user's query and transform it into an SQL fragment which
	 * will become part of a WHERE clause.
	 *
	 * As a side effect, $this->searchTerms is reset and filled with one
	 * highlighting regex fragment per parsed term.
	 *
	 * @param string $filteredText Query text, already passed through filter()
	 * @param bool $fulltext True to match against si_text, false for si_title
	 * @return string " MATCH(<field>) AGAINST('<expr>' IN BOOLEAN MODE) "
	 */
	function parseQuery( $filteredText, $fulltext ) {
		global $wgContLang;

		$lc = SearchEngine::legalSearchChars(); // Minus format chars
		$searchon = '';
		$this->searchTerms = array();

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = array();
		if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
			$filteredText, $m, PREG_SET_ORDER ) ) {
			foreach ( $m as $bits ) {
				// For a quoted phrase the two trailing capture groups do not
				// participate in the match and are omitted from $bits, so pad
				// the array instead of suppressing undefined-offset errors.
				list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) =
					array_pad( $bits, 5, '' );

				if ( $nonQuoted !== '' ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && ( $modifier == '' ) ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = array( $term );
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map(
					array( $wgContLang, 'normalizeForSearch' ),
					$variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				// Several distinct variants are grouped in parentheses behind
				// the single modifier.
				$hasMultipleVariants = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $hasMultipleVariants ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					if ( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $hasMultipleVariants ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
		}

		$searchon = $this->db->strencode( $searchon );
		$field = $this->getIndexField( $fulltext );
		return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) ";
	}

	/**
	 * Build the highlighting regex fragment for a single search term.
	 *
	 * @param string $string Raw term; it is escaped with preg_quote() for '/'
	 * @param string $wildcard Truthy when the term carried a trailing '*'
	 * @return string Regex fragment without delimiters
	 */
	function regexTerm( $string, $wildcard ) {
		global $wgContLang;

		$regex = preg_quote( $string, '/' );
		if ( $wgContLang->hasWordBreaks() ) {
			if ( $wildcard ) {
				// Don't cut off the final bit!
				$regex = "\b$regex";
			} else {
				$regex = "\b$regex\b";
			}
		}
		// Otherwise (e.g. for Chinese), words may legitimately abut other
		// words in the text literal. Don't add \b boundary checks... note
		// this could cause false positives for latin chars.

		return $regex;
	}

	/**
	 * Characters accepted in a search term: the parent's set plus the
	 * double quote and asterisk.
	 *
	 * @return string
	 */
	public static function legalSearchChars() {
		return "\"*" . parent::legalSearchChars();
	}

	/**
	 * Perform a full text search query and return a result set.
	 *
	 * @param string $term Raw search term
	 * @return MySQLSearchResultSet|null Null when the term is blank
	 */
	function searchText( $term ) {
		return $this->searchInternal( $term, true );
	}

	/**
	 * Perform a title-only search query and return a result set.
	 *
	 * @param string $term Raw search term
	 * @return MySQLSearchResultSet|null Null when the term is blank
	 */
	function searchTitle( $term ) {
		return $this->searchInternal( $term, false );
	}

	/**
	 * Run the search query and, when $wgCountTotalSearchHits is enabled,
	 * a second COUNT(*) query for the total number of hits.
	 *
	 * @param string $term Raw search term
	 * @param bool $fulltext True to search page text, false for titles
	 * @return MySQLSearchResultSet|null Null when the term is blank
	 */
	protected function searchInternal( $term, $fulltext ) {
		global $wgCountTotalSearchHits;

		// This seems out of place, why is this called with empty term?
		if ( trim( $term ) === '' ) {
			return null;
		}

		$filteredTerm = $this->filter( $term );
		$query = $this->getQuery( $filteredTerm, $fulltext );
		$resultSet = $this->db->select(
			$query['tables'], $query['fields'], $query['conds'],
			__METHOD__, $query['options'], $query['joins']
		);

		$total = null;
		if ( $wgCountTotalSearchHits ) {
			$query = $this->getCountQuery( $filteredTerm, $fulltext );
			$totalResult = $this->db->select(
				$query['tables'], $query['fields'], $query['conds'],
				__METHOD__, $query['options'], $query['joins']
			);

			$row = $totalResult->fetchObject();
			if ( $row ) {
				$total = intval( $row->c );
			}
			$totalResult->free();
		}

		return new MySQLSearchResultSet( $resultSet, $this->searchTerms, $total );
	}

	/**
	 * Report whether an optional search feature is supported.
	 *
	 * @param string $feature Feature name
	 * @return bool True for 'title-suffix-filter'; otherwise the parent's answer
	 */
	public function supports( $feature ) {
		switch ( $feature ) {
			case 'title-suffix-filter':
				return true;
			default:
				return parent::supports( $feature );
		}
	}

	/**
	 * Add feature-specific conditions to the query. Only a truthy
	 * 'title-suffix-filter' value is handled here; it adds a LIKE condition
	 * on page_title built from anyString() followed by the value.
	 *
	 * @param array &$query Query array, modified in place
	 */
	protected function queryFeatures( &$query ) {
		foreach ( $this->features as $feature => $value ) {
			if ( $feature === 'title-suffix-filter' && $value ) {
				$query['conds'][] = 'page_title' .
					$this->db->buildLike( $this->db->anyString(), $value );
			}
		}
	}

	/**
	 * Add the namespace condition to the query. Nothing is added unless
	 * $this->namespaces is an array; an empty array is replaced by
	 * namespace '0' (this also updates $this->namespaces itself).
	 *
	 * @param array &$query Query array, modified in place
	 */
	function queryNamespaces( &$query ) {
		if ( is_array( $this->namespaces ) ) {
			if ( count( $this->namespaces ) === 0 ) {
				$this->namespaces[] = '0';
			}
			$query['conds']['page_namespace'] = $this->namespaces;
		}
	}

	/**
	 * Add LIMIT and OFFSET options taken from $this->limit / $this->offset.
	 *
	 * @param array &$query Query array, modified in place
	 */
	protected function limitResult( &$query ) {
		$query['options']['LIMIT'] = $this->limit;
		$query['options']['OFFSET'] = $this->offset;
	}

	/**
	 * Construct the full query array for a search.
	 *
	 * @param string $filteredTerm Search term, already filtered
	 * @param bool $fulltext True to search page text, false for titles
	 * @return array Array with keys 'tables', 'fields', 'conds', 'options', 'joins'
	 */
	function getQuery( $filteredTerm, $fulltext ) {
		$query = array(
			'tables' => array(),
			'fields' => array(),
			'conds' => array(),
			'options' => array(),
			'joins' => array(),
		);

		$this->queryMain( $query, $filteredTerm, $fulltext );
		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );
		$this->limitResult( $query );

		return $query;
	}

	/**
	 * Pick which searchindex column to match against.
	 *
	 * @param bool $fulltext
	 * @return string 'si_text' when $fulltext is truthy, otherwise 'si_title'
	 */
	function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Fill in the base part of the search query: the page/searchindex
	 * tables, the selected page fields, the join condition and the
	 * fulltext MATCH condition.
	 *
	 * @param array &$query Query array, modified in place
	 * @param string $filteredTerm Search term, already filtered
	 * @param bool $fulltext True to search page text, false for titles
	 */
	function queryMain( &$query, $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$query['tables'][] = 'page';
		$query['tables'][] = 'searchindex';
		$query['fields'][] = 'page_id';
		$query['fields'][] = 'page_namespace';
		$query['fields'][] = 'page_title';
		$query['conds'][] = 'page_id=si_page';
		$query['conds'][] = $match;
	}

	/**
	 * Construct the query array used to count all matching rows. Same
	 * tables and conditions as the main query, but selecting COUNT(*) as
	 * `c` and without LIMIT/OFFSET.
	 *
	 * @param string $filteredTerm Search term, already filtered
	 * @param bool $fulltext True to search page text, false for titles
	 * @return array Array with keys 'tables', 'fields', 'conds', 'options', 'joins'
	 */
	function getCountQuery( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );

		$query = array(
			'tables' => array( 'page', 'searchindex' ),
			'fields' => array( 'COUNT(*) as c' ),
			'conds' => array( 'page_id=si_page', $match ),
			'options' => array(),
			'joins' => array(),
		);

		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );

		return $query;
	}

	/**
	 * Create or replace the searchindex row for a page, storing the
	 * normalized title and text. The row is identified by si_page.
	 *
	 * @param int $id Page id
	 * @param string $title Title text to index
	 * @param string $text Body text to index
	 */
	function update( $id, $title, $text ) {
		$dbw = wfGetDB( DB_MASTER );
		$dbw->replace( 'searchindex',
			array( 'si_page' ),
			array(
				'si_page' => $id,
				'si_title' => $this->normalizeText( $title ),
				'si_text' => $this->normalizeText( $text )
			), __METHOD__ );
	}

	/**
	 * Update only the indexed title of a page, leaving si_text untouched.
	 *
	 * @param int $id Page id
	 * @param string $title New title text to index
	 */
	function updateTitle( $id, $title ) {
		$dbw = wfGetDB( DB_MASTER );

		$dbw->update( 'searchindex',
			array( 'si_title' => $this->normalizeText( $title ) ),
			array( 'si_page' => $id ),
			__METHOD__,
			array( $dbw->lowPriorityOption() ) );
	}

	/**
	 * Delete the searchindex row of a page.
	 *
	 * @param int $id Page id whose row is removed
	 * @param string $title Not used by this implementation
	 */
	function delete( $id, $title ) {
		$dbw = wfGetDB( DB_MASTER );

		$dbw->delete( 'searchindex', array( 'si_page' => $id ), __METHOD__ );
	}

	/**
	 * Convert text to the form stored in, and matched against, the index.
	 * Used both when writing index rows and when building queries.
	 *
	 * @param string $string
	 * @return string
	 */
	function normalizeText( $string ) {
		global $wgContLang;

		wfProfileIn( __METHOD__ );

		$out = parent::normalizeText( $string );

		// MySQL fulltext index doesn't grok utf-8, so we
		// need to fold cases and convert to hex
		$out = preg_replace_callback(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			array( $this, 'stripForSearchCallback' ),
			$wgContLang->lc( $out ) );

		// And to add insult to injury, the default indexing
		// ignores short words... Pad them so we can pass them
		// through without reconfiguring the server...
		$minLength = $this->minSearchLength();
		if ( $minLength > 1 ) {
			$n = $minLength - 1;
			$out = preg_replace(
				"/\b(\w{1,$n})\b/",
				"$1u800",
				$out );
		}

		// Periods within things like hostnames and IP addresses
		// are also important -- we want a search for "example.com"
		// or "192.168.1.1" to work sanely.
		//
		// MySQL's search seems to ignore them, so you'd match on
		// "example.wikipedia.com" and "192.168.83.1" as well.
		$out = preg_replace(
			"/(\w)\.(\w|\*)/u",
			"$1u82e$2",
			$out );

		wfProfileOut( __METHOD__ );

		return $out;
	}

	/**
	 * preg_replace_callback() handler for normalizeText(): replaces the
	 * matched byte sequence with 'u8' followed by its hexadecimal form.
	 *
	 * @param array $matches Match array; element 1 is the byte sequence
	 * @return string
	 */
	protected function stripForSearchCallback( $matches ) {
		return 'u8' . bin2hex( $matches[1] );
	}

	/**
	 * Look up the server's ft_min_word_len setting. The value is queried
	 * once and cached in self::$mMinSearchLength; 0 is cached when the
	 * variable cannot be read.
	 *
	 * @return int
	 */
	protected function minSearchLength() {
		if ( is_null( self::$mMinSearchLength ) ) {
			$sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'";

			$dbr = wfGetDB( DB_SLAVE );
			$result = $dbr->query( $sql );
			$row = $result->fetchObject();
			$result->free();

			if ( $row && $row->Variable_name == 'ft_min_word_len' ) {
				self::$mMinSearchLength = intval( $row->Value );
			} else {
				self::$mMinSearchLength = 0;
			}
		}
		return self::$mMinSearchLength;
	}
}

/**
 * SQL search result set that additionally carries the total hit count
 * supplied by the search engine.
 */
class MySQLSearchResultSet extends SqlSearchResultSet {
	/**
	 * @param mixed $resultSet Database result, passed on to the parent
	 * @param array $terms Highlighting regex fragments, passed on to the parent
	 * @param int|null $totalHits Total number of matches, or null if not counted
	 */
	function __construct( $resultSet, $terms, $totalHits = null ) {
		parent::__construct( $resultSet, $terms );
		$this->mTotalHits = $totalHits;
	}

	/**
	 * @return int|null The total given to the constructor
	 */
	function getTotalHits() {
		return $this->mTotalHits;
	}
}