MediaWiki
REL1_24
|
<?php

/**
 * Search backend that queries the `searchindex` table through MySQL's
 * fulltext `MATCH ... AGAINST (... IN BOOLEAN MODE)` syntax.
 */
class SearchMySQL extends SearchDatabase {
	/** @var bool When true, terms given without a modifier are prefixed with '+' */
	protected $strictMatching = true;

	/** @var int|null Cached ft_min_word_len value; filled lazily by minSearchLength() */
	private static $mMinSearchLength;

	/**
	 * Parse a query string and turn it into an SQL fragment suitable for a
	 * WHERE clause. As a side effect, $this->searchTerms is reset and filled
	 * with one highlighting regex per parsed term.
	 *
	 * @param string $filteredText Query text (searchInternal() passes the result of filter())
	 * @param bool $fulltext Selects the index column, see getIndexField()
	 * @return string " MATCH(<field>) AGAINST(<quoted expression> IN BOOLEAN MODE) "
	 */
	public function parseQuery( $filteredText, $fulltext ) {
		global $wgContLang;

		// Word characters only, i.e. minus the syntax chars. This must not be
		// $this->legalSearchChars(): that list also contains '"' and '*', and
		// with those inside the character class below the bare-word
		// alternative swallows the trailing wildcard and the opening quote
		// of a phrase, so neither is ever recognised.
		$lc = parent::legalSearchChars();
		$searchon = '';
		$this->searchTerms = array();

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = array();
		$pattern = '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/';
		if ( preg_match_all( $pattern, $filteredText, $m, PREG_SET_ORDER ) ) {
			foreach ( $m as $bits ) {
				// A quoted phrase matches the second alternative, in which
				// case groups 3 and 4 are missing from $bits. Default them
				// explicitly rather than suppressing the notices.
				$modifier = $bits[1];
				$term = $bits[2];
				$nonQuoted = isset( $bits[3] ) ? $bits[3] : '';
				$wildcard = isset( $bits[4] ) ? $bits[4] : '';

				if ( $nonQuoted !== '' ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && ( $modifier === '' ) ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = array( $term );
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map(
					array( $wgContLang, 'normalizeForSearch' ),
					$variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );
				$hasSeveralVariants = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $hasSeveralVariants ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					// Strict comparison so that the bare term "0" is still
					// treated as a non-quoted term.
					if ( $nonQuoted !== '' && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $hasSeveralVariants ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
		}

		$searchon = $this->db->addQuotes( $searchon );
		$field = $this->getIndexField( $fulltext );

		return " MATCH($field) AGAINST($searchon IN BOOLEAN MODE) ";
	}

	/**
	 * Build the highlighting regex fragment for a single search term.
	 *
	 * @param string $string Term text; it is escaped with preg_quote() for the '/' delimiter
	 * @param string $wildcard Non-empty when the term ended in a wildcard
	 * @return string Regex fragment without delimiters
	 */
	public function regexTerm( $string, $wildcard ) {
		global $wgContLang;

		$regex = preg_quote( $string, '/' );
		if ( $wgContLang->hasWordBreaks() ) {
			if ( $wildcard ) {
				// Don't cut off the final bit!
				$regex = "\b$regex";
			} else {
				$regex = "\b$regex\b";
			}
		} else {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for latin chars.
		}

		return $regex;
	}

	/**
	 * Characters accepted in a query: the parent's list plus '"' and '*'.
	 *
	 * @return string Character list, prefixed with the two syntax characters
	 */
	public static function legalSearchChars() {
		return "\"*" . parent::legalSearchChars();
	}

	/**
	 * Run a search against the page text index column.
	 *
	 * @param string $term Raw search term
	 * @return SqlSearchResultSet|null Null when $term is blank
	 */
	public function searchText( $term ) {
		return $this->searchInternal( $term, true );
	}

	/**
	 * Run a search against the page title index column.
	 *
	 * @param string $term Raw search term
	 * @return SqlSearchResultSet|null Null when $term is blank
	 */
	public function searchTitle( $term ) {
		return $this->searchInternal( $term, false );
	}

	/**
	 * Shared implementation of searchText() and searchTitle(): runs the
	 * result query, then a second COUNT(*) query for the total.
	 *
	 * @param string $term Raw search term
	 * @param bool $fulltext Selects the index column, see getIndexField()
	 * @return SqlSearchResultSet|null Null when $term is blank
	 */
	protected function searchInternal( $term, $fulltext ) {
		// This seems out of place, why is this called with empty term?
		if ( trim( $term ) === '' ) {
			return null;
		}

		$filteredTerm = $this->filter( $term );
		$query = $this->getQuery( $filteredTerm, $fulltext );
		$resultSet = $this->db->select(
			$query['tables'], $query['fields'], $query['conds'],
			__METHOD__, $query['options'], $query['joins']
		);

		$total = null;
		$countQuery = $this->getCountQuery( $filteredTerm, $fulltext );
		$totalResult = $this->db->select(
			$countQuery['tables'], $countQuery['fields'], $countQuery['conds'],
			__METHOD__, $countQuery['options'], $countQuery['joins']
		);

		// $total stays null if the count query yields no row.
		$row = $totalResult->fetchObject();
		if ( $row ) {
			$total = intval( $row->c );
		}
		$totalResult->free();

		return new SqlSearchResultSet( $resultSet, $this->searchTerms, $total );
	}

	/**
	 * Report feature support; this class adds 'title-suffix-filter' and
	 * defers everything else to the parent.
	 *
	 * @param string $feature Feature name
	 * @return bool
	 */
	public function supports( $feature ) {
		switch ( $feature ) {
			case 'title-suffix-filter':
				return true;
			default:
				return parent::supports( $feature );
		}
	}

	/**
	 * Add conditions for the enabled entries of $this->features. Only
	 * 'title-suffix-filter' is handled: it appends a page_title LIKE
	 * condition built from anyString() followed by the feature value.
	 *
	 * @param array &$query Query description, modified in place
	 */
	protected function queryFeatures( &$query ) {
		foreach ( $this->features as $feature => $value ) {
			if ( $feature === 'title-suffix-filter' && $value ) {
				$query['conds'][] = 'page_title' .
					$this->db->buildLike( $this->db->anyString(), $value );
			}
		}
	}

	/**
	 * Restrict the query to $this->namespaces when that is an array. An
	 * empty array is replaced by array( '0' ); note that this updates the
	 * property itself, not just the query.
	 *
	 * @param array &$query Query description, modified in place
	 */
	public function queryNamespaces( &$query ) {
		if ( is_array( $this->namespaces ) ) {
			if ( count( $this->namespaces ) === 0 ) {
				$this->namespaces[] = '0';
			}
			$query['conds']['page_namespace'] = $this->namespaces;
		}
	}

	/**
	 * Apply $this->limit and $this->offset as LIMIT / OFFSET options.
	 *
	 * @param array &$query Query description, modified in place
	 */
	protected function limitResult( &$query ) {
		$query['options']['LIMIT'] = $this->limit;
		$query['options']['OFFSET'] = $this->offset;
	}

	/**
	 * Assemble the complete result query description.
	 *
	 * @param string $filteredTerm Search term as returned by filter()
	 * @param bool $fulltext Selects the index column, see getIndexField()
	 * @return array Map with the keys 'tables', 'fields', 'conds', 'options' and 'joins'
	 */
	public function getQuery( $filteredTerm, $fulltext ) {
		$query = array(
			'tables' => array(),
			'fields' => array(),
			'conds' => array(),
			'options' => array(),
			'joins' => array(),
		);

		$this->queryMain( $query, $filteredTerm, $fulltext );
		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );
		$this->limitResult( $query );

		return $query;
	}

	/**
	 * Pick which searchindex column to match against.
	 *
	 * @param bool $fulltext
	 * @return string 'si_text' when $fulltext is truthy, otherwise 'si_title'
	 */
	public function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Add the page/searchindex tables, the selected page fields, the join
	 * condition and the fulltext match condition to the query.
	 *
	 * @param array &$query Query description, modified in place
	 * @param string $filteredTerm Search term as returned by filter()
	 * @param bool $fulltext Selects the index column, see getIndexField()
	 */
	public function queryMain( &$query, $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$query['tables'][] = 'page';
		$query['tables'][] = 'searchindex';
		$query['fields'][] = 'page_id';
		$query['fields'][] = 'page_namespace';
		$query['fields'][] = 'page_title';
		$query['conds'][] = 'page_id=si_page';
		$query['conds'][] = $match;
	}

	/**
	 * Assemble the COUNT(*) query description used for the total. It uses
	 * the same match, feature and namespace conditions as getQuery() but
	 * sets no LIMIT / OFFSET.
	 *
	 * @param string $filteredTerm Search term as returned by filter()
	 * @param bool $fulltext Selects the index column, see getIndexField()
	 * @return array Map with the keys 'tables', 'fields', 'conds', 'options' and 'joins'
	 */
	public function getCountQuery( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );

		$query = array(
			'tables' => array( 'page', 'searchindex' ),
			'fields' => array( 'COUNT(*) as c' ),
			'conds' => array( 'page_id=si_page', $match ),
			'options' => array(),
			'joins' => array(),
		);

		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );

		return $query;
	}

	/**
	 * Create or replace the searchindex row for a page, storing the
	 * normalized title and text.
	 *
	 * @param int $id Page id, written to si_page
	 * @param string $title
	 * @param string $text
	 */
	public function update( $id, $title, $text ) {
		$dbw = wfGetDB( DB_MASTER );
		$dbw->replace( 'searchindex',
			array( 'si_page' ),
			array(
				'si_page' => $id,
				'si_title' => $this->normalizeText( $title ),
				'si_text' => $this->normalizeText( $text )
			), __METHOD__ );
	}

	/**
	 * Update only the normalized title of an existing searchindex row.
	 *
	 * @param int $id Page id, matched against si_page
	 * @param string $title
	 */
	public function updateTitle( $id, $title ) {
		$dbw = wfGetDB( DB_MASTER );

		$dbw->update( 'searchindex',
			array( 'si_title' => $this->normalizeText( $title ) ),
			array( 'si_page' => $id ),
			__METHOD__,
			array( $dbw->lowPriorityOption() ) );
	}

	/**
	 * Delete the searchindex row of a page.
	 *
	 * @param int $id Page id, matched against si_page
	 * @param string $title Not used by this implementation
	 */
	public function delete( $id, $title ) {
		$dbw = wfGetDB( DB_MASTER );

		$dbw->delete( 'searchindex', array( 'si_page' => $id ), __METHOD__ );
	}

	/**
	 * Convert text into the form stored in, and searched against, the
	 * fulltext index: lower-cased, with high-byte sequences replaced by
	 * 'u8' + hex, short words padded with 'u800', and periods between word
	 * characters replaced by 'u82e'.
	 *
	 * @param string $string
	 * @return string
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		wfProfileIn( __METHOD__ );

		$out = parent::normalizeText( $string );

		// MySQL fulltext index doesn't grok utf-8, so we
		// need to fold cases and convert to hex
		$out = preg_replace_callback(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			array( $this, 'stripForSearchCallback' ),
			$wgContLang->lc( $out ) );

		// And to add insult to injury, the default indexing
		// ignores short words... Pad them so we can pass them
		// through without reconfiguring the server...
		$minLength = $this->minSearchLength();
		if ( $minLength > 1 ) {
			$n = $minLength - 1;
			$out = preg_replace(
				"/\b(\w{1,$n})\b/",
				"$1u800",
				$out );
		}

		// Periods within things like hostnames and IP addresses
		// are also important -- we want a search for "example.com"
		// or "192.168.1.1" to work sanely.
		//
		// MySQL's search seems to ignore them, so you'd match on
		// "example.wikipedia.com" and "192.168.83.1" as well.
		$out = preg_replace(
			"/(\w)\.(\w|\*)/u",
			"$1u82e$2",
			$out );

		wfProfileOut( __METHOD__ );

		return $out;
	}

	/**
	 * preg_replace_callback() handler used by normalizeText(): encodes the
	 * captured byte sequence as 'u8' followed by its hex representation.
	 *
	 * @param array $matches Match array; index 1 holds the captured bytes
	 * @return string
	 */
	protected function stripForSearchCallback( $matches ) {
		return 'u8' . bin2hex( $matches[1] );
	}

	/**
	 * Look up the server's ft_min_word_len setting. The value is fetched
	 * once and cached in a static property; 0 is cached when the variable
	 * is not reported.
	 *
	 * @return int
	 */
	protected function minSearchLength() {
		if ( is_null( self::$mMinSearchLength ) ) {
			$sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'";

			$dbr = wfGetDB( DB_SLAVE );
			$result = $dbr->query( $sql );
			$row = $result->fetchObject();
			$result->free();

			if ( $row && $row->Variable_name == 'ft_min_word_len' ) {
				self::$mMinSearchLength = intval( $row->Value );
			} else {
				self::$mMinSearchLength = 0;
			}
		}

		return self::$mMinSearchLength;
	}
}