MediaWiki
REL1_21
|
<?php
/**
 * Basic search engine classes: the SearchEngine base class, the result set
 * and result wrappers, and the snippet highlighter.
 */

/**
 * Base class for search back-ends. The text/title search methods here are
 * stubs returning null; back-ends are expected to override them.
 */
class SearchEngine {
	var $limit = 10;
	var $offset = 0;
	var $prefix = '';
	var $searchTerms = array();
	var $namespaces = array( NS_MAIN );
	var $showRedirects = false;

	/** Feature name => data, as stored by setFeatureData() */
	protected $features = array();

	/** Database handle; defaults to a DB_SLAVE connection */
	protected $db;

	/**
	 * @param $db mixed Database handle to use; when falsy, wfGetDB( DB_SLAVE ) is used
	 */
	function __construct( $db = null ) {
		if ( $db ) {
			$this->db = $db;
		} else {
			$this->db = wfGetDB( DB_SLAVE );
		}
	}

	/**
	 * Perform a full text search query. Stub: always returns null here.
	 *
	 * @param $term String: raw search term
	 * @return null
	 */
	function searchText( $term ) {
		return null;
	}

	/**
	 * Perform a title-only search query. Stub: always returns null here.
	 *
	 * @param $term String: raw search term
	 * @return null
	 */
	function searchTitle( $term ) {
		return null;
	}

	/**
	 * @deprecated since 1.18, call supports( 'list-redirects' ) instead
	 * @return Boolean
	 */
	function acceptListRedirects() {
		wfDeprecated( __METHOD__, '1.18' );
		return $this->supports( 'list-redirects' );
	}

	/**
	 * Report whether this engine supports the named feature.
	 *
	 * @param $feature String
	 * @return Boolean: true for 'list-redirects', false for anything else
	 */
	public function supports( $feature ) {
		switch ( $feature ) {
		case 'list-redirects':
			return true;
		case 'title-suffix-filter':
		default:
			return false;
		}
	}

	/**
	 * Store data for a feature, keyed by the feature name.
	 *
	 * @param $feature String
	 * @param $data Mixed
	 */
	public function setFeatureData( $feature, $data ) {
		$this->features[$feature] = $data;
	}

	/**
	 * Normalize text by running it through the content language's
	 * segmentByWord().
	 *
	 * @param $string String
	 * @return mixed Result of $wgContLang->segmentByWord()
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		// Some languages such as Chinese require word segmentation
		return $wgContLang->segmentByWord( $string );
	}

	/**
	 * Transform a search term before use. The base implementation returns
	 * the term unchanged.
	 *
	 * @param $term String
	 * @return String
	 */
	function transformSearchTerm( $term ) {
		return $term;
	}

	/**
	 * Find a title that nearly matches the search term, then give the
	 * 'SearchGetNearMatchComplete' hook a chance to alter the result.
	 *
	 * @param $searchterm String
	 * @return mixed Title found by getNearMatchInternal() (possibly changed by the hook), or null
	 */
	public static function getNearMatch( $searchterm ) {
		$title = self::getNearMatchInternal( $searchterm );

		wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
		return $title;
	}

	/**
	 * Same as getNearMatch(), but wraps the result in a
	 * SearchNearMatchResultSet.
	 *
	 * @param $searchterm String
	 * @return SearchNearMatchResultSet
	 */
	public static function getNearMatchResultSet( $searchterm ) {
		return new SearchNearMatchResultSet( self::getNearMatch( $searchterm ) );
	}

	/**
	 * Do the work for getNearMatch(): try the term (and its language
	 * variants) as an exact title, then in several capitalisations, then
	 * fall back to a few namespace-specific special cases.
	 *
	 * @param $searchterm String
	 * @return mixed Matching title, or null when nothing matched
	 */
	private static function getNearMatchInternal( $searchterm ) {
		global $wgContLang, $wgEnableSearchContributorsByIP;

		$allSearchTerms = array( $searchterm );

		if ( $wgContLang->hasVariants() ) {
			$allSearchTerms = array_merge(
				$allSearchTerms,
				$wgContLang->autoConvertToAllVariants( $searchterm )
			);
		}

		$titleResult = null;
		if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) {
			return $titleResult;
		}

		foreach ( $allSearchTerms as $term ) {

			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}

			# Try files if searching in the Media: namespace
			if ( $title->getNamespace() == NS_MEDIA ) {
				$title = Title::makeTitle( NS_FILE, $title->getText() );
			}

			if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
				return $title;
			}

			# See if it still otherwise has content in some sane sense
			$page = WikiPage::factory( $title );
			if ( $page->hasViewableContent() ) {
				return $title;
			}

			if ( !wfRunHooks( 'SearchAfterNoDirectMatch', array( $term, &$title ) ) ) {
				return $title;
			}

			# Now try all lower case (i.e. first letter capitalized)
			$title = Title::newFromText( $wgContLang->lc( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try capitalized string
			$title = Title::newFromText( $wgContLang->ucwords( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try all upper case
			$title = Title::newFromText( $wgContLang->uc( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
			$title = Title::newFromText( $wgContLang->ucwordbreaks( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		$title = Title::newFromText( $searchterm );
		if ( is_null( $title ) ) {
			// Defensive: $title is dereferenced by every check below.
			return null;
		}

		# Entering an IP address goes to the contributions page
		if ( $wgEnableSearchContributorsByIP ) {
			if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
				|| User::isIP( trim( $searchterm ) ) ) {
				return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
			}
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $title->getNamespace() == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $title->getNamespace() == NS_FILE ) {
			$image = wfFindFile( $title );
			if ( $image ) {
				return $title;
			}
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$matches = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
			return SearchEngine::getNearMatch( $matches[1] );
		}

		return null;
	}

	/**
	 * Characters allowed in search terms, written as the inside of a
	 * regex character class (see filter()).
	 *
	 * @return String
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'.0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the maximum number of results to return and how many to skip.
	 * Both values are cast with intval().
	 *
	 * @param $limit Integer
	 * @param $offset Integer
	 */
	function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set which namespaces the search should include.
	 * replacePrefixes() sets this to null for the "all:" keyword.
	 *
	 * @param $namespaces Mixed
	 */
	function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Parse a leading "all:" keyword or namespace prefix off the query,
	 * update $this->namespaces accordingly, and return the rest.
	 * The 'SearchEngineReplacePrefixesComplete' hook is run before returning.
	 *
	 * @param $query String
	 * @return String: query with the recognised prefix removed, or the
	 *   original query if no prefix applied or nothing would be left
	 */
	function replacePrefixes( $query ) {
		global $wgContLang;

		$parsed = $query;
		if ( strpos( $query, ':' ) === false ) { // nothing to do
			wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
			return $parsed;
		}

		$allkeyword = wfMessage( 'searchall' )->inContentLanguage()->text() . ":";
		if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
			$this->namespaces = null;
			$parsed = substr( $query, strlen( $allkeyword ) );
		} else {
			// A colon is guaranteed to be present here (see the early return above)
			$prefix = substr( $query, 0, strpos( $query, ':' ) );
			$index = $wgContLang->getNsIndex( $prefix );
			if ( $index !== false ) {
				$this->namespaces = array( $index );
				$parsed = substr( $query, strlen( $prefix ) + 1 );
			}
		}
		if ( trim( $parsed ) == '' ) {
			$parsed = $query; // prefix was the whole query
		}

		wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

		return $parsed;
	}

	/**
	 * Build the list of searchable namespaces: every content-language
	 * namespace with an index >= NS_MAIN, as adjusted by the
	 * 'SearchableNamespaces' hook.
	 *
	 * @return Array: namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;
		$arr = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns >= NS_MAIN ) {
				$arr[$ns] = $name;
			}
		}

		wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
		return $arr;
	}

	/**
	 * Extract the namespaces to search from the user's preferences.
	 *
	 * @param $user Object providing isLoggedIn() and getOption()
	 * @return Array: list of namespace indexes
	 */
	public static function userNamespaces( $user ) {
		global $wgSearchEverythingOnlyLoggedIn;

		$searchableNamespaces = SearchEngine::searchableNamespaces();

		// The "search everything" preference overrides the per-namespace
		// options; it may be restricted to logged-in users.
		if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
			if ( $user->getOption( 'searcheverything' ) ) {
				return array_keys( $searchableNamespaces );
			}
		}

		$arr = array();
		foreach ( $searchableNamespaces as $ns => $name ) {
			if ( $user->getOption( 'searchNs' . $ns ) ) {
				$arr[] = $ns;
			}
		}

		return $arr;
	}

	/**
	 * Snippet highlighting settings. Both values are hard-coded.
	 *
	 * @return Array: ( context lines, context characters ) = ( 2, 75 )
	 */
	public static function userHighlightPrefs() {
		$contextlines = 2;
		$contextchars = 75;
		return array( $contextlines, $contextchars );
	}

	/**
	 * Namespaces searched by default: the keys of
	 * $wgNamespacesToBeSearchedDefault whose value is true.
	 *
	 * @return Array
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Map namespace indexes to their formatted names. Empty names are
	 * replaced with the 'blanknamespace' message.
	 *
	 * @param $namespaces Array
	 * @return Array
	 */
	public static function namespacesAsText( $namespaces ) {
		global $wgContLang;

		$formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
		foreach ( $formatted as $key => $ns ) {
			if ( empty( $ns ) ) {
				$formatted[$key] = wfMessage( 'blanknamespace' )->text();
			}
		}
		return $formatted;
	}

	/**
	 * Help-related namespaces: the keys of $wgNamespacesToBeSearchedHelp
	 * whose value is true.
	 *
	 * @return Array
	 */
	public static function helpNamespaces() {
		global $wgNamespacesToBeSearchedHelp;

		return array_keys( $wgNamespacesToBeSearchedHelp, true );
	}

	/**
	 * Replace every character not listed by legalSearchChars() with a
	 * space, then trim the result.
	 *
	 * @param $text String
	 * @return String
	 */
	function filter( $text ) {
		$lc = $this->legalSearchChars();
		return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
	}

	/**
	 * Instantiate the configured search engine: the class named by
	 * $wgSearchType if set, otherwise the one reported by the database
	 * handle's getSearchEngine().
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgSearchType;
		$dbr = null;
		if ( $wgSearchType ) {
			$class = $wgSearchType;
		} else {
			$dbr = wfGetDB( DB_SLAVE );
			$class = $dbr->getSearchEngine();
		}
		$search = new $class( $dbr );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * Create or update the search index record for a page.
	 * No-op in the base class.
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Update only the title of a search index record.
	 * No-op in the base class.
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get the OpenSearch URL template: $wgOpenSearchTemplate when set,
	 * otherwise an api.php opensearch URL over the default namespaces
	 * (namespace "0" when there are none).
	 *
	 * @return String
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgCanonicalServer;
		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		} else {
			$ns = implode( '|', SearchEngine::defaultNamespaces() );
			if ( !$ns ) {
				$ns = "0";
			}
			return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
		}
	}
}

/**
 * Base result set. Every method here returns an "empty" value; subclasses
 * override what they support.
 */
class SearchResultSet {
	/**
	 * Terms (as used for highlighting) matched by the search.
	 *
	 * @return Array
	 */
	function termMatches() {
		return array();
	}

	/**
	 * @return Integer: number of rows in this set
	 */
	function numRows() {
		return 0;
	}

	/**
	 * @return Boolean: whether the set contains any result
	 */
	function hasResults() {
		return false;
	}

	/**
	 * @return mixed Total number of hits, or null when unknown
	 */
	function getTotalHits() {
		return null;
	}

	/**
	 * @return Boolean: whether a suggested alternate query is available
	 */
	function hasSuggestion() {
		return false;
	}

	/**
	 * @return mixed Suggested query, or null if none
	 */
	function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return String: snippet for the suggested query, '' if none
	 */
	function getSuggestionSnippet() {
		return '';
	}

	/**
	 * @return mixed Extra information about the result set, or null if none
	 */
	function getInfo() {
		return null;
	}

	/**
	 * @return mixed Interwiki results, or null if none
	 */
	function getInterwikiResults() {
		return null;
	}

	/**
	 * @return Boolean: whether getInterwikiResults() returned something non-empty
	 */
	function hasInterwikiResults() {
		return $this->getInterwikiResults() != null;
	}

	/**
	 * Fetch the next result.
	 *
	 * @return mixed The next result, or false when there are no more
	 */
	function next() {
		return false;
	}

	/**
	 * Release any resources held by the result set.
	 */
	function free() {
		// ...
	}
}

/**
 * Result set backed by a database result wrapper. A wrapped value of
 * false is treated as "no result".
 */
class SqlSearchResultSet extends SearchResultSet {

	protected $mResultSet;

	/** Terms passed to the constructor; declared to avoid a dynamic property */
	public $mTerms;

	/**
	 * @param $resultSet mixed Database result object, or false
	 * @param $terms mixed Terms to report from termMatches()
	 */
	function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	function termMatches() {
		return $this->mTerms;
	}

	/**
	 * @return mixed Row count of the wrapped result, or false if there is none
	 */
	function numRows() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		return $this->mResultSet->numRows();
	}

	/**
	 * @return mixed SearchResult for the next row, or false when exhausted
	 */
	function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}

		return SearchResult::newFromRow( $row );
	}

	/**
	 * Free the wrapped result.
	 *
	 * @return mixed false when there is no wrapped result, otherwise nothing
	 */
	function free() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$this->mResultSet->free();
	}
}

/**
 * Marker class: some search engines may bail out if too many matches are found.
 */
class SearchResultTooMany {
}

/**
 * A single search result: a title plus, when available, its revision
 * and/or file.
 */
class SearchResult {

	var $mRevision = null;
	var $mImage = null;

	var $mTitle;

	var $mText;

	/**
	 * Build a result from a title.
	 *
	 * @param $title mixed Title object, or null
	 * @return SearchResult
	 */
	public static function newFromTitle( $title ) {
		$result = new self();
		$result->initFromTitle( $title );
		return $result;
	}

	/**
	 * Build a result from a database row.
	 *
	 * @param $row Object with page_namespace and page_title fields
	 * @return SearchResult
	 */
	public static function newFromRow( $row ) {
		$result = new self();
		$result->initFromRow( $row );
		return $result;
	}

	/**
	 * @param $row mixed Optional database row (see initFromRow())
	 */
	public function __construct( $row = null ) {
		if ( !is_null( $row ) ) {
			// Backwards compatibility with pre-1.17 callers
			$this->initFromRow( $row );
		}
	}

	/**
	 * Initialize from a database row by building a title from its
	 * page_namespace and page_title fields.
	 *
	 * @param $row Object
	 */
	protected function initFromRow( $row ) {
		$this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
	}

	/**
	 * Initialize from a title: load its revision (the
	 * 'SearchResultInitFromTitle' hook may pick the revision id) and, for
	 * NS_FILE titles, look up the file. A null title only sets mTitle.
	 *
	 * @param $title mixed Title object, or null
	 */
	protected function initFromTitle( $title ) {
		$this->mTitle = $title;
		if ( !is_null( $this->mTitle ) ) {
			$id = false;
			wfRunHooks( 'SearchResultInitFromTitle', array( $title, &$id ) );
			$this->mRevision = Revision::newFromTitle(
				$this->mTitle, $id, Revision::READ_NORMAL );
			if ( $this->mTitle->getNamespace() === NS_FILE ) {
				$this->mImage = wfFindFile( $this->mTitle );
			}
		}
	}

	/**
	 * @return Boolean: true when this result has no title
	 */
	function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * @return Boolean: true when neither a revision nor a file was found
	 */
	function isMissingRevision() {
		return !$this->mRevision && !$this->mImage;
	}

	/**
	 * @return mixed The title this result was built from
	 */
	function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return mixed Relevance score; null in the base class
	 */
	function getScore() {
		return null;
	}

	/**
	 * Lazily fill mText from the revision's content; '' when there is no
	 * revision or no content.
	 */
	protected function initText() {
		if ( !isset( $this->mText ) ) {
			if ( $this->mRevision != null ) {
				// TODO: if we could plug in some code that knows about special content models *and* about
				// special features of the search engine, the search could benefit.
				$content = $this->mRevision->getContent();
				$this->mText = $content ? $content->getTextForSearchIndex() : '';
			} else { // TODO: can we fetch raw wikitext for commons images?
				$this->mText = '';
			}
		}
	}

	/**
	 * Build a highlighted snippet of the page text, using the advanced
	 * highlighter when $wgAdvancedSearchHighlighting is set and the simple
	 * one otherwise.
	 *
	 * @param $terms Array: terms to highlight
	 * @return String
	 */
	function getTextSnippet( $terms ) {
		global $wgAdvancedSearchHighlighting;
		$this->initText();

		// TODO: make highlighter take a content object. Make ContentHandler a factory for SearchHighlighter.
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
		$h = new SearchHighlighter();
		if ( $wgAdvancedSearchHighlighting ) {
			return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		} else {
			return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
		}
	}

	/**
	 * @param $terms Array
	 * @return String: highlighted title; '' in the base class
	 */
	function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param $terms Array
	 * @return String: highlighted redirect name; '' in the base class
	 */
	function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return mixed Redirect title; null in the base class
	 */
	function getRedirectTitle() {
		return null;
	}

	/**
	 * @return String: highlighted section name; '' in the base class
	 */
	function getSectionSnippet() {
		return '';
	}

	/**
	 * @return mixed Section title; null in the base class
	 */
	function getSectionTitle() {
		return null;
	}

	/**
	 * @return mixed Timestamp of the revision if there is one, else of the
	 *   file if there is one, else ''
	 */
	function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		} elseif ( $this->mImage ) {
			return $this->mImage->getTimestamp();
		}
		return '';
	}

	/**
	 * @return Integer: str_word_count() of the page text
	 */
	function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return Integer: length of the page text in bytes
	 */
	function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return Boolean: false in the base class
	 */
	function hasRelated() {
		return false;
	}

	/**
	 * @return String: interwiki prefix; '' in the base class
	 */
	function getInterwikiPrefix() {
		return '';
	}
}

/**
 * A result set holding at most one result: the near-match title, if any.
 */
class SearchNearMatchResultSet extends SearchResultSet {
	private $fetched = false;

	/** The matched title (or a falsy value); declared to avoid a dynamic property */
	public $result;

	/**
	 * @param $match mixed Title if a matching page was found, falsy otherwise
	 */
	public function __construct( $match ) {
		$this->result = $match;
	}

	/**
	 * Overrides the parent stub so that numRows() reflects the match.
	 *
	 * @return Boolean
	 */
	public function hasResults() {
		return (bool)$this->result;
	}

	/**
	 * Alias of hasResults(), kept for callers using the old method name.
	 *
	 * @return Boolean
	 */
	public function hasResult() {
		return $this->hasResults();
	}

	/**
	 * @return Integer: 1 when there is a match, 0 otherwise
	 */
	public function numRows() {
		return $this->hasResults() ? 1 : 0;
	}

	/**
	 * @return mixed SearchResult for the match on the first call, false afterwards
	 *   or when there is no match
	 */
	public function next() {
		if ( $this->fetched || !$this->result ) {
			return false;
		}
		$this->fetched = true;
		return SearchResult::newFromTitle( $this->result );
	}
}

/**
 * Builds highlighted text snippets for search results.
 */
class SearchHighlighter {
	var $mCleanWikitext = true;

	/**
	 * @param $cleanupWikitext Boolean: whether splitAndAdd() strips wiki markup
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Advanced highlighter: splits the text into plain lines and
	 * template/file-link/table extracts, picks the lines matching the terms
	 * and wraps the matches in <span class='searchmatch'>.
	 *
	 * @param $text String
	 * @param $terms Array: terms, used as regex fragments (not quoted here)
	 * @param $contextlines Integer
	 * @param $contextchars Integer
	 * @return String
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		global $wgSearchHighlightBoundaries;
		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

		// @todo FIXME: This should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != - 1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, - 1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}
						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
			} else {
				$terms[$index] = $term;
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		$anytermChars = mb_strlen( $anyterm );
		// Guard against an empty term list, which would divide by zero
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		// Only read when $snippets is non-empty; initialised so it is never undefined
		$targetchars = 0;
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		$snippets = $extended;
		$last = - 1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == - 1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		$processed = array();
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and append the non-empty (trimmed, truthy)
	 * ones to $extracts, keyed by the running counter $count. Wiki markup
	 * is removed first when mCleanWikitext is set.
	 *
	 * @param $extracts Array: extracts to append to (by reference)
	 * @param $count Integer: next key to use (by reference)
	 * @param $text String
	 */
	function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			if ( $tt ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * preg_replace_callback() helper: turn a multi-byte character into a
	 * character class of its lower- and upper-case forms; single-byte
	 * characters are returned unchanged.
	 *
	 * @param $matches Array
	 * @return String
	 */
	function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract the part of $text between (roughly) byte offsets $start and
	 * $end, with both ends adjusted by position().
	 *
	 * @param $text String
	 * @param $start Integer
	 * @param $end Integer
	 * @param $posStart Integer: (out) actual start offset; only written when passed non-null
	 * @param $posEnd Integer: (out) actual end offset; only written when passed non-null
	 * @return String: the extract, or '' when the adjusted range is empty
	 */
	function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a split position near $point: the first punctuation/space
	 * character within 10 bytes either side, or else $point moved forward
	 * past any UTF-8 continuation bytes.
	 *
	 * @param $text String
	 * @param $point Integer
	 * @param $offset Integer: added to the position when a break character is found
	 * @return Integer
	 */
	function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = array();
		if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= strlen( $text ) ) {
					return strlen( $text );
				}
				$char = ord( $text[$point] );
			}
			return $point;
		}
	}

	/**
	 * Search the extracts for the pattern and record a basic snippet for
	 * each matching line not already in $out, until $linesleft hits zero.
	 *
	 * @param $pattern String: regexp for matching lines
	 * @param $extracts Array: extracts to search
	 * @param $linesleft Integer: number of extracts still to make (by reference)
	 * @param $contextchars Integer: length of a snippet
	 * @param $out Array: index => snippet (by reference)
	 * @param $offsets Array: index => start offset of the snippet in its line (by reference)
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				$begin = $offset;
			} else {
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal: strips templates, unwraps links (see
	 * linkReplace() for piped links), and removes HTML-like tags and
	 * bold/italic quote markup.
	 *
	 * @param $text String
	 * @return String
	 */
	function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * preg_replace_callback() helper for piped links: file and category
	 * links are kept whole, every other link is replaced by its caption.
	 *
	 * @param $matches Array: [0] whole link, [1] target including '|', [2] caption
	 * @return String
	 */
	function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple and fast snippet builder: for each of the first $contextlines
	 * matching lines, keep the match with up to $contextchars of context
	 * either side, HTML-escape it and highlight the terms.
	 *
	 * @param $text String
	 * @param $terms Array: terms, used as regex fragments (not quoted here)
	 * @param $contextlines Integer
	 * @param $contextchars Integer
	 * @return String
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = array();
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			$extract .= "{$line}\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

}

/**
 * Dummy class to be used when non-supported database engine is present.
 * Inherits the stub behaviour of SearchEngine unchanged.
 */
class SearchEngineDummy extends SearchEngine {
	// no-op
}