MediaWiki
REL1_22
|
<?php
/**
 * Basic search engine base class plus the result-set, result and
 * snippet-highlighting helper classes that go with it.
 */

/**
 * Base class for search engine back ends.
 *
 * The search/update methods defined here are no-ops that return null;
 * concrete back ends are expected to override them.
 */
class SearchEngine {
	var $limit = 10;
	var $offset = 0;
	var $prefix = '';
	var $searchTerms = array();
	var $namespaces = array( NS_MAIN );
	var $showRedirects = false;

	/** @var array Feature data keyed by feature name, see setFeatureData() */
	protected $features = array();

	/** @var mixed Database handle: the one passed to the constructor, else wfGetDB( DB_SLAVE ) */
	protected $db;

	/**
	 * @param mixed $db Database handle to use; when falsy a DB_SLAVE
	 *   connection is obtained through wfGetDB()
	 */
	function __construct( $db = null ) {
		if ( $db ) {
			$this->db = $db;
		} else {
			$this->db = wfGetDB( DB_SLAVE );
		}
	}

	/**
	 * Perform a full text search query. The base implementation does nothing.
	 *
	 * @param string $term Raw search term
	 * @return null
	 */
	function searchText( $term ) {
		return null;
	}

	/**
	 * Perform a title-only search query. The base implementation does nothing.
	 *
	 * @param string $term Raw search term
	 * @return null
	 */
	function searchTitle( $term ) {
		return null;
	}

	/**
	 * Deprecated wrapper around supports( 'list-redirects' ).
	 *
	 * @deprecated since 1.18, call supports( 'list-redirects' ) instead
	 * @return bool
	 */
	function acceptListRedirects() {
		wfDeprecated( __METHOD__, '1.18' );
		return $this->supports( 'list-redirects' );
	}

	/**
	 * Tell whether this engine supports a named feature.
	 *
	 * @param string $feature Feature name
	 * @return bool True for 'list-redirects' and 'search-update', false for
	 *   'title-suffix-filter' and any other name
	 */
	public function supports( $feature ) {
		switch ( $feature ) {
			case 'list-redirects':
			case 'search-update':
				return true;
			case 'title-suffix-filter':
			default:
				return false;
		}
	}

	/**
	 * Store data for a feature; a later call for the same feature overwrites it.
	 *
	 * @param string $feature Feature name, used as array key
	 * @param mixed $data Value to store
	 */
	public function setFeatureData( $feature, $data ) {
		$this->features[$feature] = $data;
	}

	/**
	 * Normalise text through the content language's word segmentation.
	 *
	 * @param string $string
	 * @return string Result of $wgContLang->segmentByWord()
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		// Some languages such as Chinese require word segmentation
		return $wgContLang->segmentByWord( $string );
	}

	/**
	 * Transform a search term. The base implementation returns it unchanged.
	 *
	 * @param string $term
	 * @return string
	 */
	function transformSearchTerm( $term ) {
		return $term;
	}

	/**
	 * Find a title that nearly matches the search term, then let the
	 * 'SearchGetNearMatchComplete' hook inspect or replace the outcome.
	 *
	 * @param string $searchterm
	 * @return Title|null
	 */
	public static function getNearMatch( $searchterm ) {
		$title = self::getNearMatchInternal( $searchterm );

		wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
		return $title;
	}

	/**
	 * Same as getNearMatch(), with the outcome wrapped in a result set.
	 *
	 * @param string $searchterm
	 * @return SearchNearMatchResultSet
	 */
	public static function getNearMatchResultSet( $searchterm ) {
		return new SearchNearMatchResultSet( self::getNearMatch( $searchterm ) );
	}

	/**
	 * Worker for getNearMatch(): tries the term (and, when the content
	 * language has variants, every variant conversion) as an exact title,
	 * then in several capitalisations, then falls back to a handful of
	 * namespace-specific rules.
	 *
	 * @param string $searchterm
	 * @return Title|null
	 */
	private static function getNearMatchInternal( $searchterm ) {
		global $wgContLang, $wgEnableSearchContributorsByIP;

		$allSearchTerms = array( $searchterm );

		if ( $wgContLang->hasVariants() ) {
			$allSearchTerms = array_merge( $allSearchTerms, $wgContLang->autoConvertToAllVariants( $searchterm ) );
		}

		$titleResult = null;
		if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) {
			return $titleResult;
		}

		foreach ( $allSearchTerms as $term ) {

			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}

			# Try files if searching in the Media: namespace
			if ( $title->getNamespace() == NS_MEDIA ) {
				$title = Title::makeTitle( NS_FILE, $title->getText() );
			}

			if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
				return $title;
			}

			# See if it still otherwise has content is some sane sense
			$page = WikiPage::factory( $title );
			if ( $page->hasViewableContent() ) {
				return $title;
			}

			if ( !wfRunHooks( 'SearchAfterNoDirectMatch', array( $term, &$title ) ) ) {
				return $title;
			}

			# Now try all lower case (i.e. first letter capitalized)
			$title = Title::newFromText( $wgContLang->lc( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try capitalized string
			$title = Title::newFromText( $wgContLang->ucwords( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try all upper case
			$title = Title::newFromText( $wgContLang->uc( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
			$title = Title::newFromText( $wgContLang->ucwordbreaks( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		// The first loop iteration already returned null for an unparsable
		// $searchterm, so a title is expected here.
		$title = Title::newFromText( $searchterm );

		# Entering an IP address goes to the contributions page
		if ( $wgEnableSearchContributorsByIP ) {
			if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
				|| User::isIP( trim( $searchterm ) ) ) {
				return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
			}
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $title->getNamespace() == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $title->getNamespace() == NS_FILE ) {
			$image = wfFindFile( $title );
			if ( $image ) {
				return $title;
			}
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$matches = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
			return SearchEngine::getNearMatch( $matches[1] );
		}

		return null;
	}

	/**
	 * Characters considered legal in a search term, written as the inside
	 * of a regex character class (see filter()).
	 *
	 * @return string
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'.0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the result limit and offset; both are cast with intval().
	 *
	 * @param int $limit
	 * @param int $offset
	 */
	function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set the namespaces to search.
	 *
	 * @param array|null $namespaces Namespace indexes; replacePrefixes()
	 *   stores null here for the "all" prefix
	 */
	function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Strip a leading "all:" keyword or namespace prefix from the query and
	 * adjust $this->namespaces accordingly. The
	 * 'SearchEngineReplacePrefixesComplete' hook runs on every path.
	 *
	 * @param string $query
	 * @return string Query without the recognised prefix, or the original
	 *   query when no prefix is recognised or the prefix was the whole query
	 */
	function replacePrefixes( $query ) {
		global $wgContLang;

		$parsed = $query;
		if ( strpos( $query, ':' ) === false ) { // nothing to do
			wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
			return $parsed;
		}

		$allkeyword = wfMessage( 'searchall' )->inContentLanguage()->text() . ":";
		if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
			$this->namespaces = null;
			$parsed = substr( $query, strlen( $allkeyword ) );
		} elseif ( strpos( $query, ':' ) !== false ) {
			$prefix = substr( $query, 0, strpos( $query, ':' ) );
			$index = $wgContLang->getNsIndex( $prefix );
			if ( $index !== false ) {
				$this->namespaces = array( $index );
				$parsed = substr( $query, strlen( $prefix ) + 1 );
			}
		}
		if ( trim( $parsed ) == '' ) {
			$parsed = $query; // prefix was the whole query
		}

		wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

		return $parsed;
	}

	/**
	 * List the namespaces with an index >= NS_MAIN, as reported by the
	 * content language and filtered through the 'SearchableNamespaces' hook.
	 *
	 * @return array Namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;
		$arr = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns >= NS_MAIN ) {
				$arr[$ns] = $name;
			}
		}

		wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
		return $arr;
	}

	/**
	 * Namespace indexes a user has chosen to search, read from the
	 * 'searcheverything' and 'searchNs<index>' user options.
	 *
	 * @param User $user
	 * @return array Namespace indexes
	 */
	public static function userNamespaces( $user ) {
		global $wgSearchEverythingOnlyLoggedIn;

		$searchableNamespaces = SearchEngine::searchableNamespaces();

		// get search everything preference, that can be set to be read for logged-in users
		// it overrides other options
		if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
			if ( $user->getOption( 'searcheverything' ) ) {
				return array_keys( $searchableNamespaces );
			}
		}

		$arr = array();
		foreach ( $searchableNamespaces as $ns => $name ) {
			if ( $user->getOption( 'searchNs' . $ns ) ) {
				$arr[] = $ns;
			}
		}

		return $arr;
	}

	/**
	 * Snippet sizing used by SearchResult::getTextSnippet().
	 *
	 * @return array Two elements: context lines (2) and context chars (75)
	 */
	public static function userHighlightPrefs() {
		$contextlines = 2; // Hardcode this. Old defaults sucked. :)
		$contextchars = 75; // same as above.... :P
		return array( $contextlines, $contextchars );
	}

	/**
	 * Namespace indexes whose $wgNamespacesToBeSearchedDefault entry
	 * loosely equals true.
	 *
	 * @return array
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Map namespace indexes to their formatted names; an empty name is
	 * replaced by the 'blanknamespace' message.
	 *
	 * @param array $namespaces Namespace indexes
	 * @return array Names, with the keys of $namespaces preserved
	 */
	public static function namespacesAsText( $namespaces ) {
		global $wgContLang;

		$formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
		foreach ( $formatted as $key => $ns ) {
			if ( empty( $ns ) ) {
				$formatted[$key] = wfMessage( 'blanknamespace' )->text();
			}
		}
		return $formatted;
	}

	/**
	 * Namespace indexes whose $wgNamespacesToBeSearchedHelp entry
	 * loosely equals true.
	 *
	 * @return array
	 */
	public static function helpNamespaces() {
		global $wgNamespacesToBeSearchedHelp;

		return array_keys( $wgNamespacesToBeSearchedHelp, true );
	}

	/**
	 * Replace every character outside legalSearchChars() with a space and
	 * trim the result.
	 *
	 * @param string $text
	 * @return string
	 */
	function filter( $text ) {
		$lc = $this->legalSearchChars();
		return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
	}

	/**
	 * Instantiate a search engine.
	 *
	 * The class is $type when it is one of getSearchTypes(), else
	 * $wgSearchType when that is not null, else whatever the DB_SLAVE
	 * connection's getSearchEngine() names. The new engine has its limit
	 * and offset reset to 0.
	 *
	 * @param string|null $type Requested engine class name
	 * @return SearchEngine
	 */
	public static function create( $type = null ) {
		global $wgSearchType;
		$dbr = null;

		$alternatives = self::getSearchTypes();

		if ( $type && in_array( $type, $alternatives ) ) {
			$class = $type;
		} elseif ( $wgSearchType !== null ) {
			$class = $wgSearchType;
		} else {
			$dbr = wfGetDB( DB_SLAVE );
			$class = $dbr->getSearchEngine();
		}

		$search = new $class( $dbr );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * List the available engine class names: $wgSearchType followed by
	 * $wgSearchTypeAlternatives. Computed once and cached in a static.
	 *
	 * @return array
	 */
	public static function getSearchTypes() {
		global $wgSearchType, $wgSearchTypeAlternatives;
		static $alternatives = null;
		if ( $alternatives === null ) {
			$alternatives = $wgSearchTypeAlternatives ?: array();
			array_unshift( $alternatives, $wgSearchType );
		}
		return $alternatives;
	}

	/**
	 * Index update hook for a page's text. No-op in the base class.
	 *
	 * @param int $id
	 * @param string $title
	 * @param string $text
	 */
	function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Index update hook for a page's title. No-op in the base class.
	 *
	 * @param int $id
	 * @param string $title
	 */
	function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Index removal hook for a page. No-op in the base class.
	 *
	 * @param int $id
	 * @param string $title
	 */
	function delete( $id, $title ) {
		// no-op
	}

	/**
	 * OpenSearch URL template: $wgOpenSearchTemplate when set, otherwise an
	 * api.php opensearch URL limited to the default namespaces ("0" when
	 * that list is empty).
	 *
	 * @return string
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgCanonicalServer;
		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		} else {
			$ns = implode( '|', SearchEngine::defaultNamespaces() );
			if ( !$ns ) {
				$ns = "0";
			}
			return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
		}
	}

	/**
	 * Text to index for a content object.
	 *
	 * @param Title $t Unused by the base implementation
	 * @param Content|null $c
	 * @return string $c->getTextForSearchIndex(), or '' when $c is null
	 */
	public function getTextFromContent( Title $t, Content $c = null ) {
		return $c ? $c->getTextForSearchIndex() : '';
	}

	/**
	 * The base implementation always answers false.
	 *
	 * @return bool
	 */
	public function textAlreadyUpdatedForIndex() {
		return false;
	}
}

/**
 * Base result set: an empty set whose accessors all return "nothing"
 * defaults. Subclasses override what they can provide.
 */
class SearchResultSet {
	/**
	 * @return array Empty in the base class
	 */
	function termMatches() {
		return array();
	}

	/**
	 * @return int 0 in the base class
	 */
	function numRows() {
		return 0;
	}

	/**
	 * @return bool False in the base class
	 */
	function hasResults() {
		return false;
	}

	/**
	 * @return null Null in the base class
	 */
	function getTotalHits() {
		return null;
	}

	/**
	 * @return bool False in the base class
	 */
	function hasSuggestion() {
		return false;
	}

	/**
	 * @return null Null in the base class
	 */
	function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return string Empty string in the base class
	 */
	function getSuggestionSnippet() {
		return '';
	}

	/**
	 * @return null Null in the base class
	 */
	function getInfo() {
		return null;
	}

	/**
	 * @return null Null in the base class
	 */
	function getInterwikiResults() {
		return null;
	}

	/**
	 * @return bool Whether getInterwikiResults() is loosely non-null
	 */
	function hasInterwikiResults() {
		return $this->getInterwikiResults() != null;
	}

	/**
	 * Fetch the next result.
	 *
	 * @return bool False in the base class (no results)
	 */
	function next() {
		return false;
	}

	/**
	 * Release resources. Nothing to do in the base class.
	 */
	function free() {
		// ...
	}
}

/**
 * Result set backed by a database result wrapper. A wrapper value of
 * false is tolerated: every method then returns false.
 */
class SqlSearchResultSet extends SearchResultSet {

	protected $mResultSet;

	/** @var mixed Terms handed to the constructor; kept public as it was when created dynamically */
	var $mTerms;

	/**
	 * @param mixed $resultSet Database result wrapper, or false
	 * @param mixed $terms Value returned verbatim by termMatches()
	 */
	function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	function termMatches() {
		return $this->mTerms;
	}

	function numRows() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		return $this->mResultSet->numRows();
	}

	/**
	 * @return SearchResult|bool Next row as a SearchResult, false when exhausted
	 */
	function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}

		return SearchResult::newFromRow( $row );
	}

	function free() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$this->mResultSet->free();
	}
}

class SearchResultTooMany {
	# # Some search engines may bail out if too many matches are found
}

/**
 * A single search hit: a title plus, when available, its revision and
 * (for NS_FILE titles) its file.
 */
class SearchResult {

	var $mRevision = null;
	var $mImage = null;

	var $mTitle;

	var $mText;

	/**
	 * @param Title|null $title
	 * @return SearchResult
	 */
	public static function newFromTitle( $title ) {
		$result = new self();
		$result->initFromTitle( $title );
		return $result;
	}

	/**
	 * @param object $row Row exposing page_namespace and page_title
	 * @return SearchResult
	 */
	public static function newFromRow( $row ) {
		$result = new self();
		$result->initFromRow( $row );
		return $result;
	}

	/**
	 * @param object|null $row Optional row, see newFromRow()
	 */
	public function __construct( $row = null ) {
		if ( !is_null( $row ) ) {
			// Backwards compatibility with pre-1.17 callers
			$this->initFromRow( $row );
		}
	}

	/**
	 * @param object $row Row exposing page_namespace and page_title
	 */
	protected function initFromRow( $row ) {
		$this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
	}

	/**
	 * Store the title and, when it is not null, load its revision (the
	 * 'SearchResultInitFromTitle' hook may choose the revision id) and,
	 * for NS_FILE titles, its file.
	 *
	 * @param Title|null $title
	 */
	protected function initFromTitle( $title ) {
		$this->mTitle = $title;
		if ( !is_null( $this->mTitle ) ) {
			$id = false;
			wfRunHooks( 'SearchResultInitFromTitle', array( $title, &$id ) );
			$this->mRevision = Revision::newFromTitle(
				$this->mTitle, $id, Revision::READ_NORMAL );
			if ( $this->mTitle->getNamespace() === NS_FILE ) {
				$this->mImage = wfFindFile( $this->mTitle );
			}
		}
	}

	/**
	 * @return bool True when no title is set
	 */
	function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * @return bool True when neither a revision nor a file was found
	 */
	function isMissingRevision() {
		return !$this->mRevision && !$this->mImage;
	}

	/**
	 * @return Title|null
	 */
	function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return null The base class has no score
	 */
	function getScore() {
		return null;
	}

	/**
	 * Lazily fill $this->mText from the revision content; '' when there
	 * is no revision.
	 */
	protected function initText() {
		if ( !isset( $this->mText ) ) {
			if ( $this->mRevision != null ) {
				$this->mText = SearchEngine::create()
					->getTextFromContent( $this->mTitle, $this->mRevision->getContent() );
			} else { // TODO: can we fetch raw wikitext for commons images?
				$this->mText = '';
			}
		}
	}

	/**
	 * @param array $terms Terms to highlight
	 * @return string Highlighted text snippet; the advanced or the simple
	 *   highlighter is used depending on $wgAdvancedSearchHighlighting
	 */
	function getTextSnippet( $terms ) {
		global $wgAdvancedSearchHighlighting;
		$this->initText();

		// TODO: make highliter take a content object. Make ContentHandler a factory for SearchHighliter.
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
		$h = new SearchHighlighter();
		if ( $wgAdvancedSearchHighlighting ) {
			return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		} else {
			return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
		}
	}

	/**
	 * @param array $terms
	 * @return string Empty in the base class
	 */
	function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param array $terms
	 * @return string Empty in the base class
	 */
	function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return null Null in the base class
	 */
	function getRedirectTitle() {
		return null;
	}

	/**
	 * @return string Empty in the base class
	 */
	function getSectionSnippet() {
		return '';
	}

	/**
	 * @return null Null in the base class
	 */
	function getSectionTitle() {
		return null;
	}

	/**
	 * @return string Timestamp of the revision, else of the file, else ''
	 */
	function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		} elseif ( $this->mImage ) {
			return $this->mImage->getTimestamp();
		}
		return '';
	}

	/**
	 * @return int str_word_count() of the text
	 */
	function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return int Byte length of the text
	 */
	function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return bool False in the base class
	 */
	function hasRelated() {
		return false;
	}

	/**
	 * @return string Empty in the base class
	 */
	function getInterwikiPrefix() {
		return '';
	}
}

/**
 * Result set holding at most one near-match title.
 */
class SearchNearMatchResultSet extends SearchResultSet {
	private $fetched = false;

	/** @var mixed The match given to the constructor; kept public as it was when created dynamically */
	public $result;

	/**
	 * @param mixed $match Title if matched, else null
	 */
	public function __construct( $match ) {
		$this->result = $match;
	}

	/**
	 * Overrides the base class, which always answers false. Without this
	 * override numRows() could never report the match.
	 *
	 * @return bool Whether a match is held
	 */
	public function hasResults() {
		return (bool)$this->result;
	}

	/**
	 * Original (singular) spelling, kept for existing callers.
	 *
	 * @return bool Same as hasResults()
	 */
	public function hasResult() {
		return $this->hasResults();
	}

	/**
	 * @return int 1 when a match is held, else 0
	 */
	public function numRows() {
		return $this->hasResults() ? 1 : 0;
	}

	/**
	 * @return SearchResult|bool The match on the first call, false afterwards
	 *   or when there is no match
	 */
	public function next() {
		if ( $this->fetched || !$this->result ) {
			return false;
		}
		$this->fetched = true;
		return SearchResult::newFromTitle( $this->result );
	}
}

/**
 * Builds highlighted text snippets for search results.
 */
class SearchHighlighter {
	var $mCleanWikitext = true;

	/**
	 * @param bool $cleanupWikitext Whether splitAndAdd() runs removeWiki()
	 *   on the text before splitting it
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Advanced highlighter: splits the text into plain-text lines and
	 * template/image/table/reference lines, picks the lines matching the
	 * terms, pads them to a common size and wraps the matches in
	 * <span class='searchmatch'>.
	 *
	 * @param string $text
	 * @param array $terms Terms, used as regex fragments without quoting
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		global $wgSearchHighlightBoundaries;
		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

		// @todo FIXME: This should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != - 1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, - 1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}

						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( ! $found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
			} else {
				$terms[$index] = $term;
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// An empty term list gives an empty $anyterm; keep the scale neutral
		// then instead of dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( ! $snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		// default used when no snippet was found; recomputed below otherwise
		$targetchars = $contextchars * $contextlines;
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// $snippets = array_map( 'htmlspecialchars', $extended );
		$snippets = $extended;
		$last = - 1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == - 1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
				$extract .= " " . $line; // continous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		$processed = array();
		foreach ( $terms as $term ) {
			if ( ! isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines (after removeWiki() when wikitext cleanup is
	 * enabled) and append every non-empty trimmed line to $extracts.
	 *
	 * @param array &$extracts Receives the lines, keyed by sequence number
	 * @param int &$count Next sequence number; incremented per added line
	 * @param string $text
	 */
	function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// compare against '' explicitly: a line that is just "0" is content too
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * preg_replace_callback() helper: turn a multi-byte character into a
	 * character class holding its lower- and upper-case forms.
	 *
	 * @param array $matches
	 * @return string
	 */
	function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract the part of $text between two byte positions, after moving
	 * both positions with position().
	 *
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int|null &$posStart When not null, receives the adjusted start
	 * @param int|null &$posEnd When not null, receives the adjusted end
	 * @return string '' when the adjusted range is empty
	 */
	function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a break position near $point: the first break character within
	 * 10 bytes either side, else the next byte that is not a UTF-8
	 * continuation byte.
	 *
	 * @param string $text
	 * @param int $point Byte offset
	 * @param int $offset Added to the position of a break character
	 * @return int
	 */
	function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = array();
		if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// nothing to inspect at or past the end of the string
			if ( $point >= strlen( $text ) ) {
				return strlen( $text );
			}
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= strlen( $text ) ) {
					return strlen( $text );
				}
				$char = ord( $text[$point] );
			}
			return $point;

		}
	}

	/**
	 * Search the extracts for a pattern and record a basic snippet for
	 * each matching line until $linesleft reaches 0.
	 *
	 * @param string $pattern Regular expression
	 * @param array $extracts Lines keyed by sequence number
	 * @param int &$linesleft Number of snippets still wanted
	 * @param int &$contextchars Snippet length in bytes
	 * @param array &$out Receives snippets, keyed like $extracts
	 * @param array &$offsets Receives the snippet start offsets
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				$begin = $offset;
			} else {
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Strip a subset of wiki markup with regular expressions: templates,
	 * links (see linkReplace()), HTML-like tags and quote markup.
	 *
	 * @param string $text
	 * @return string
	 */
	function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * preg_replace_callback() helper for piped links: NS_FILE and
	 * NS_CATEGORY links are kept whole, any other link is replaced by
	 * its caption.
	 *
	 * @param array $matches
	 * @return string
	 */
	function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple highlighter: for each of the first $contextlines lines that
	 * match a term, output the HTML-escaped match with truncated context
	 * and the terms wrapped in <span class='searchmatch'>.
	 *
	 * @param string $text
	 * @param array $terms Terms, used as regex fragments without quoting
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string One snippet per line
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = array();
			if ( ! preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			$extract .= "{$line}\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

}

/**
 * Dummy engine that inherits every no-op from SearchEngine.
 */
class SearchEngineDummy extends SearchEngine {
	// no-op
}