MediaWiki
REL1_19
|
<?php
/**
 * Basic search engine support: the SearchEngine base class, the result-set
 * and result wrappers it hands back, and the snippet highlighter.
 *
 * @file
 * @ingroup Search
 */

/**
 * Base class for search back-ends.
 *
 * The base implementation performs no searching at all (searchText() and
 * searchTitle() return null); concrete engines are expected to override them.
 *
 * @ingroup Search
 */
class SearchEngine {
	var $limit = 10;
	var $offset = 0;
	var $prefix = '';
	var $searchTerms = array();
	var $namespaces = array( NS_MAIN );
	var $showRedirects = false;

	/** Feature values, keyed by feature name (see setFeatureData()) */
	protected $features = array();

	/**
	 * Database handle used by the engine.
	 */
	protected $db;

	/**
	 * @param $db mixed Database handle to use; when empty, a DB_SLAVE
	 *   connection is obtained through wfGetDB().
	 */
	function __construct($db = null) {
		if ( $db ) {
			$this->db = $db;
		} else {
			$this->db = wfGetDB( DB_SLAVE );
		}
	}

	/**
	 * Perform a full text search query and return a result set.
	 * The base class does not implement full text search and returns null.
	 *
	 * @param $term String: raw search term
	 * @return null in this base implementation
	 */
	function searchText( $term ) {
		return null;
	}

	/**
	 * Perform a title-only search query and return a result set.
	 * The base class does not implement title search and returns null.
	 *
	 * @param $term String: raw search term
	 * @return null in this base implementation
	 */
	function searchTitle( $term ) {
		return null;
	}

	/**
	 * Whether the engine can list redirects.
	 *
	 * @deprecated since 1.18 Call supports( 'list-redirects' ) instead.
	 * @return Boolean
	 */
	function acceptListRedirects() {
		wfDeprecated( __METHOD__, '1.18' );
		return $this->supports( 'list-redirects' );
	}

	/**
	 * Report whether this engine supports a named feature.
	 * The base class supports only 'list-redirects'.
	 *
	 * @param $feature String
	 * @return Boolean
	 */
	public function supports( $feature ) {
		switch( $feature ) {
		case 'list-redirects':
			return true;
		case 'title-suffix-filter':
		default:
			return false;
		}
	}

	/**
	 * Store data for a feature; the value is kept in $this->features.
	 *
	 * @param $feature String
	 * @param $data Mixed
	 */
	public function setFeatureData( $feature, $data ) {
		$this->features[$feature] = $data;
	}

	/**
	 * Normalize text before it is indexed or matched.
	 * The base implementation only asks the content language to segment
	 * the string by word.
	 *
	 * @param $string String
	 * @return String
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		// Some languages such as Chinese require word segmentation
		return $wgContLang->segmentByWord( $string );
	}

	/**
	 * Transform a search term before it is used; the base class returns
	 * the term unchanged.
	 *
	 * @param $term String
	 * @return String
	 */
	function transformSearchTerm( $term ) {
		return $term;
	}

	/**
	 * Find a title that is a near match for the search term, then let the
	 * 'SearchGetNearMatchComplete' hook inspect or replace it.
	 *
	 * @param $searchterm String
	 * @return Title|null
	 */
	public static function getNearMatch( $searchterm ) {
		$title = self::getNearMatchInternal( $searchterm );

		wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
		return $title;
	}

	/**
	 * Same as getNearMatch(), but wrapped in a SearchNearMatchResultSet.
	 *
	 * @param $searchterm String
	 * @return SearchNearMatchResultSet
	 */
	public static function getNearMatchResultSet( $searchterm ) {
		return new SearchNearMatchResultSet( self::getNearMatch( $searchterm ) );
	}

	/**
	 * Do the actual near-match lookup; see getNearMatch().
	 *
	 * @param $searchterm String
	 * @return Title|null
	 */
	private static function getNearMatchInternal( $searchterm ) {
		global $wgContLang, $wgEnableSearchContributorsByIP;

		$allSearchTerms = array( $searchterm );

		if ( $wgContLang->hasVariants() ) {
			$allSearchTerms = array_merge( $allSearchTerms, $wgContLang->autoConvertToAllVariants( $searchterm ) );
		}

		$titleResult = null;
		if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) {
			return $titleResult;
		}

		foreach ( $allSearchTerms as $term ) {

			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ){
				return null;
			}

			if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
				return $title;
			}

			# See if it still otherwise has content in some sane sense
			$page = WikiPage::factory( $title );
			if ( $page->hasViewableContent() ) {
				return $title;
			}

			# Now try all lower case (i.e. first letter capitalized)
			$title = Title::newFromText( $wgContLang->lc( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try capitalized string
			$title = Title::newFromText( $wgContLang->ucwords( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try all upper case
			$title = Title::newFromText( $wgContLang->uc( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
			$title = Title::newFromText( $wgContLang->ucwordbreaks( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		$title = Title::newFromText( $searchterm );

		# Defensive: every use below dereferences $title, so never continue
		# with an unparseable term.
		if ( is_null( $title ) ) {
			return null;
		}

		# Entering an IP address goes to the contributions page
		if ( $wgEnableSearchContributorsByIP ) {
			if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
				|| User::isIP( trim( $searchterm ) ) ) {
				return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
			}
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $title->getNamespace() == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $title->getNamespace() == NS_FILE ) {
			$image = wfFindFile( $title );
			if ( $image ) {
				return $title;
			}
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$matches = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
			return SearchEngine::getNearMatch( $matches[1] );
		}

		return null;
	}

	/**
	 * Character-class body (for use inside a regex [...]) listing the
	 * characters filter() keeps.
	 *
	 * @return String
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'.0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the maximum number of results to return and how many to skip
	 * before returning the first. Both values are coerced with intval().
	 *
	 * @param $limit Integer
	 * @param $offset Integer
	 */
	function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set which namespaces the search should include.
	 *
	 * @param $namespaces Array|null replaces $this->namespaces as given
	 */
	function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Parse a leading "all:" keyword or namespace prefix out of the query.
	 * On a recognised prefix $this->namespaces is updated (null for the
	 * 'searchall' keyword, a one-element array for a namespace) and the
	 * remainder of the query is returned. The
	 * 'SearchEngineReplacePrefixesComplete' hook runs before every return.
	 *
	 * @param $query String
	 * @return String the query with any recognised prefix removed
	 */
	function replacePrefixes( $query ) {
		global $wgContLang;

		$parsed = $query;
		if ( strpos( $query, ':' ) === false ) { // nothing to do
			wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
			return $parsed;
		}

		$allkeyword = wfMsgForContent( 'searchall' ) . ":";
		if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
			$this->namespaces = null;
			$parsed = substr( $query, strlen( $allkeyword ) );
		} elseif ( strpos( $query, ':' ) !== false ) {
			$prefix = substr( $query, 0, strpos( $query, ':' ) );
			$index = $wgContLang->getNsIndex( $prefix );
			if ( $index !== false ) {
				$this->namespaces = array( $index );
				$parsed = substr( $query, strlen( $prefix ) + 1 );
			}
		}
		if ( trim( $parsed ) == '' ) {
			$parsed = $query; // prefix was the whole query
		}

		wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

		return $parsed;
	}

	/**
	 * Make a list of searchable namespaces and their canonical names:
	 * every content-language namespace with index >= NS_MAIN, after the
	 * 'SearchableNamespaces' hook has had a chance to edit the list.
	 *
	 * @return Array namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;
		$arr = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns >= NS_MAIN ) {
				$arr[$ns] = $name;
			}
		}

		wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
		return $arr;
	}

	/**
	 * Extract the namespaces a user searches by default from their
	 * preferences.
	 *
	 * @param $user User
	 * @return Array of namespace indexes
	 */
	public static function userNamespaces( $user ) {
		global $wgSearchEverythingOnlyLoggedIn;

		$searchableNamespaces = SearchEngine::searchableNamespaces();

		// get search everything preference, that can be set to be read for logged-in users
		// it overrides other options
		if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
			if ( $user->getOption( 'searcheverything' ) ) {
				return array_keys( $searchableNamespaces );
			}
		}

		$arr = array();
		foreach ( $searchableNamespaces as $ns => $name ) {
			if ( $user->getOption( 'searchNs' . $ns ) ) {
				$arr[] = $ns;
			}
		}

		return $arr;
	}

	/**
	 * Highlighting preferences. These are hardcoded; no user preference
	 * is consulted.
	 *
	 * @return Array ( contextlines, contextchars )
	 */
	public static function userHighlightPrefs() {
		$contextlines = 2; // Hardcode this. Old defaults sucked. :)
		$contextchars = 75; // same as above.... :P
		return array( $contextlines, $contextchars );
	}

	/**
	 * Namespace indexes enabled in $wgNamespacesToBeSearchedDefault.
	 *
	 * @return Array
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Get a list of namespace names useful for showing in tooltips and
	 * preferences. Empty names are replaced by the 'blanknamespace' message.
	 *
	 * @param $namespaces Array
	 * @return Array
	 */
	public static function namespacesAsText( $namespaces ) {
		global $wgContLang;

		$formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
		foreach ( $formatted as $key => $ns ) {
			if ( empty( $ns ) ) {
				$formatted[$key] = wfMsg( 'blanknamespace' );
			}
		}
		return $formatted;
	}

	/**
	 * Namespace indexes enabled in $wgNamespacesToBeSearchedHelp.
	 *
	 * @return Array
	 */
	public static function helpNamespaces() {
		global $wgNamespacesToBeSearchedHelp;

		return array_keys( $wgNamespacesToBeSearchedHelp, true );
	}

	/**
	 * Replace every character outside legalSearchChars() with a space and
	 * trim the result.
	 *
	 * @param $text String
	 * @return String
	 */
	function filter( $text ) {
		// Called through $this so that a subclass override of
		// legalSearchChars() is honoured.
		$lc = $this->legalSearchChars();
		return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
	}

	/**
	 * Load up the appropriate search engine class for the currently
	 * active configuration: $wgSearchType when set, otherwise whatever
	 * the slave database connection reports via getSearchEngine().
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgSearchType;
		$dbr = null;
		if ( $wgSearchType ) {
			$class = $wgSearchType;
		} else {
			$dbr = wfGetDB( DB_SLAVE );
			$class = $dbr->getSearchEngine();
		}
		$search = new $class( $dbr );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * Create or update the search index record for the given page.
	 * No-op in the base class.
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Update a search index record's title only.
	 * No-op in the base class.
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get the OpenSearch suggestion template: $wgOpenSearchTemplate when
	 * configured, otherwise an api.php opensearch URL restricted to the
	 * default namespaces (or namespace 0 when there are none).
	 *
	 * @return String
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgCanonicalServer;
		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		} else {
			$ns = implode( '|', SearchEngine::defaultNamespaces() );
			if ( !$ns ) {
				$ns = "0";
			}
			return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
		}
	}

	/**
	 * Get the internal MediaWiki suggestion template:
	 * $wgMWSuggestTemplate when configured, otherwise an api.php URL.
	 *
	 * @return String
	 */
	public static function getMWSuggestTemplate() {
		global $wgMWSuggestTemplate, $wgServer;
		if ( $wgMWSuggestTemplate ) {
			return $wgMWSuggestTemplate;
		} else {
			return $wgServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
		}
	}
}

/**
 * Base result set: an empty set. Subclasses override what they support.
 *
 * @ingroup Search
 */
class SearchResultSet {
	/**
	 * Regex fragments for the terms that were matched, for highlighting.
	 *
	 * @return Array
	 */
	function termMatches() {
		return array();
	}

	/**
	 * @return Integer number of rows in this set (0 in the base class)
	 */
	function numRows() {
		return 0;
	}

	/**
	 * @return Boolean whether the set contains any result (false in the
	 *   base class)
	 */
	function hasResults() {
		return false;
	}

	/**
	 * Total number of hits across all pages of results, or null when the
	 * back-end cannot tell.
	 *
	 * @return Integer|null
	 */
	function getTotalHits() {
		return null;
	}

	/**
	 * @return Boolean whether an alternate query suggestion is available
	 */
	function hasSuggestion() {
		return false;
	}

	/**
	 * @return String|null suggested alternate query
	 */
	function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return String suggested query snippet ('' in the base class)
	 */
	function getSuggestionSnippet() {
		return '';
	}

	/**
	 * @return String|null additional information about the set
	 */
	function getInfo() {
		return null;
	}

	/**
	 * @return SearchResultSet|null results from other wikis
	 */
	function getInterwikiResults() {
		return null;
	}

	/**
	 * @return Boolean whether getInterwikiResults() returned anything
	 */
	function hasInterwikiResults() {
		return $this->getInterwikiResults() != null;
	}

	/**
	 * Fetch the next result, or false when there are no more.
	 *
	 * @return SearchResult|false
	 */
	function next() {
		return false;
	}

	/**
	 * Release any resources held by the set. No-op in the base class.
	 */
	function free() {
		// ...
	}
}

/**
 * Result set backed by a database result wrapper.
 * A wrapped value of false is treated as "no result set".
 *
 * @ingroup Search
 */
class SqlSearchResultSet extends SearchResultSet {

	protected $mResultSet;

	/**
	 * Term patterns handed back by termMatches(). Declared (it used to be
	 * a dynamic property) and left public so existing readers keep working.
	 */
	var $mTerms;

	/**
	 * @param $resultSet mixed database result, or false
	 * @param $terms Array returned unchanged by termMatches()
	 */
	function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	function termMatches() {
		return $this->mTerms;
	}

	/**
	 * @return Integer|false false when there is no underlying result set
	 */
	function numRows() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		return $this->mResultSet->numRows();
	}

	/**
	 * @return SearchResult|false false when there is no result set or no
	 *   further row
	 */
	function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}

		return SearchResult::newFromRow( $row );
	}

	/**
	 * Free the underlying result set.
	 *
	 * @return false|null false when there is no result set to free
	 */
	function free() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$this->mResultSet->free();
	}
}

/**
 * Marker class.
 *
 * @ingroup Search
 */
class SearchResultTooMany {
	# # Some search engines may bail out if too many matches are found
}

/**
 * A single search result: a title plus its current revision and, for
 * titles in NS_FILE, the file found by wfFindFile().
 *
 * @todo FIXME: This class is horribly factored. It would probably be better
 * to have a useful base class to which you pass some standard information,
 * then let the fancy self-highlighters extend that.
 * @ingroup Search
 */
class SearchResult {

	/**
	 * @var Revision
	 */
	var $mRevision = null;
	var $mImage = null;

	/**
	 * @var Title
	 */
	var $mTitle;

	/**
	 * @var String
	 */
	var $mText;

	/**
	 * Return a new SearchResult initialised from a title.
	 *
	 * @param $title Title
	 * @return SearchResult
	 */
	public static function newFromTitle( $title ) {
		$result = new self();
		$result->initFromTitle( $title );
		return $result;
	}

	/**
	 * Return a new SearchResult initialised from a database row.
	 *
	 * @param $row object with page_namespace and page_title fields
	 * @return SearchResult
	 */
	public static function newFromRow( $row ) {
		$result = new self();
		$result->initFromRow( $row );
		return $result;
	}

	/**
	 * @param $row object|null when given, the result is initialised from it
	 */
	public function __construct( $row = null ) {
		if ( !is_null( $row ) ) {
			// Backwards compatibility with pre-1.17 callers
			$this->initFromRow( $row );
		}
	}

	/**
	 * Initialise from a database row; builds the Title from the row's
	 * page_namespace and page_title and defers to initFromTitle().
	 *
	 * @param $row object
	 */
	protected function initFromRow( $row ) {
		$this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
	}

	/**
	 * Initialise from a Title and, when it is not null, load its revision
	 * and (for NS_FILE) its file.
	 *
	 * @param $title Title|null
	 */
	protected function initFromTitle( $title ) {
		$this->mTitle = $title;
		if ( !is_null( $this->mTitle ) ) {
			$this->mRevision = Revision::newFromTitle( $this->mTitle );
			if ( $this->mTitle->getNamespace() === NS_FILE ) {
				$this->mImage = wfFindFile( $this->mTitle );
			}
		}
	}

	/**
	 * Check if this result is for a title that could not be built.
	 *
	 * @return Boolean
	 */
	function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * Check if the target has neither a revision nor a file.
	 *
	 * @return Boolean
	 */
	function isMissingRevision() {
		return !$this->mRevision && !$this->mImage;
	}

	/**
	 * @return Title
	 */
	function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return Float|null relevance score, if supported (null here)
	 */
	function getScore() {
		return null;
	}

	/**
	 * Lazily populate $this->mText from the revision text, or with ''
	 * when there is no revision.
	 */
	protected function initText() {
		if ( !isset( $this->mText ) ) {
			if ( $this->mRevision != null ) {
				$this->mText = $this->mRevision->getText();
			} else { // TODO: can we fetch raw wikitext for commons images?
				$this->mText = '';
			}
		}
	}

	/**
	 * Build a highlighted snippet of the page text.
	 *
	 * @param $terms Array: terms to highlight
	 * @return String highlighted text snippet
	 */
	function getTextSnippet( $terms ) {
		global $wgAdvancedSearchHighlighting;
		$this->initText();
		// userHighlightPrefs() takes no arguments.
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
		$h = new SearchHighlighter();
		if ( $wgAdvancedSearchHighlighting ) {
			return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		} else {
			return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
		}
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String highlighted title ('' when not supported)
	 */
	function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String highlighted redirect name ('' when not supported)
	 */
	function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return Title|null title of the redirect that matched, if any
	 */
	function getRedirectTitle() {
		return null;
	}

	/**
	 * @return String highlighted relevant section name ('' if none)
	 */
	function getSectionSnippet() {
		return '';
	}

	/**
	 * @return Title|null title pointing at the relevant section
	 */
	function getSectionTitle() {
		return null;
	}

	/**
	 * @return String timestamp of the revision, else of the file, else ''
	 */
	function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		} elseif ( $this->mImage ) {
			return $this->mImage->getTimestamp();
		}
		return '';
	}

	/**
	 * @return Integer number of words, as counted by str_word_count()
	 */
	function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return Integer size of the text in bytes
	 */
	function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return Boolean whether related-article search is available
	 */
	function hasRelated() {
		return false;
	}

	/**
	 * @return String interwiki prefix of the title ('' for local)
	 */
	function getInterwikiPrefix() {
		return '';
	}
}

/**
 * A result set holding at most one result: a near-match title.
 *
 * @ingroup Search
 */
class SearchNearMatchResultSet extends SearchResultSet {
	private $fetched = false;

	/**
	 * The wrapped match. Declared (it used to be a dynamic property) and
	 * left public so existing readers keep working.
	 */
	var $result;

	/**
	 * @param $match mixed Title if a match was found, null otherwise
	 */
	public function __construct( $match ) {
		$this->result = $match;
	}

	/**
	 * Whether a match is held. Overrides the parent, which always answers
	 * false; without this override numRows() could never report the match.
	 *
	 * @return Boolean
	 */
	public function hasResults() {
		return (bool)$this->result;
	}

	/**
	 * Historical singular spelling, kept for existing callers.
	 *
	 * @return Boolean
	 */
	public function hasResult() {
		return $this->hasResults();
	}

	/**
	 * @return Integer 1 when a match is held, otherwise 0
	 */
	public function numRows() {
		return $this->hasResults() ? 1 : 0;
	}

	/**
	 * Return the match once, wrapped in a SearchResult; false afterwards
	 * or when there is no match.
	 *
	 * @return SearchResult|false
	 */
	public function next() {
		if ( $this->fetched || !$this->result ) {
			return false;
		}
		$this->fetched = true;
		return SearchResult::newFromTitle( $this->result );
	}
}

/**
 * Highlight bits of wikitext.
 *
 * @ingroup Search
 */
class SearchHighlighter {
	var $mCleanWikitext = true;

	/**
	 * @param $cleanupWikitext Boolean whether splitAndAdd() runs text
	 *   through removeWiki() first
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Default implementation of wikitext highlighting.
	 *
	 * The text is split into plain-text lines and "other" lines
	 * (templates, file links, tables and, when wfCite exists, references);
	 * snippets are then picked from lines matching the terms and stitched
	 * together with '<b> ... </b>' separators.
	 *
	 * @param $text String
	 * @param $terms Array: terms to highlight (unescaped regex fragments)
	 * @param $contextlines Integer
	 * @param $contextchars Integer
	 * @return String
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		global $wgSearchHighlightBoundaries;
		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

		// @todo FIXME: This should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != - 1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, - 1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}
						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						// Group 2 (the closer) is only present in the match
						// array when it took part in the match.
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( ! $found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
			} else {
				$terms[$index] = $term;
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// With no terms the character count is zero: skip the scaling
		// instead of dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( ! $snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		// Always defined, even on the path where no snippet exists at all.
		$targetchars = $contextchars * $contextlines;
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array ( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		$snippets = $extended;
		$last = - 1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == - 1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		$processed = array();
		foreach ( $terms as $term ) {
			if ( ! isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and append every non-empty trimmed line to
	 * $extracts, keyed by the running sequence number $count. The text is
	 * passed through removeWiki() first when $this->mCleanWikitext is set.
	 *
	 * @param $extracts Array: index -> $line (by reference)
	 * @param $count Integer (by reference, incremented per line added)
	 * @param $text String
	 */
	function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// Compare against '' explicitly: a plain truthiness test would
			// also throw away a line whose whole content is "0".
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * preg_replace_callback() helper for highlightText(): turns a
	 * multi-byte character into a [lowerUPPER] class; single-byte matches
	 * are returned unchanged.
	 *
	 * @param $matches Array
	 * @return String
	 */
	function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract part of the text from start to end, both adjusted by
	 * position() unless $start is 0 or $end reaches the end of the text.
	 *
	 * @param $text String
	 * @param $start Integer
	 * @param $end Integer
	 * @param $posStart Integer: (out) actual start position, only written
	 *   when the caller passed a non-null variable
	 * @param $posEnd Integer: (out) actual end position, same condition
	 * @return String
	 */
	function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a nonletter near a point (index) in the text.
	 *
	 * The window spans 10 bytes either side of $point; the first
	 * separator found in it wins. When there is none, $point is advanced
	 * past any UTF-8 continuation bytes.
	 *
	 * @param $text String
	 * @param $point Integer
	 * @param $offset Integer: added to the result when a separator is found
	 * @return Integer: nearest nonletter index, or the (adjusted) point
	 */
	function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$textLen = strlen( $text );
		$s = max( 0, $point - $tolerance );
		$l = min( $textLen, $point + $tolerance ) - $s;
		$m = array();
		if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// Nothing to inspect at or beyond the end of the text; reading
			// $text[$point] there would be an uninitialized offset.
			if ( $point >= $textLen ) {
				return $textLen;
			}
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= $textLen ) {
					return $textLen;
				}
				$char = ord( $text[$point] );
			}
			return $point;
		}
	}

	/**
	 * Search extracts for a pattern, and add a snippet for every matching
	 * line not yet present in $out, until $linesleft reaches zero.
	 *
	 * @param $pattern String: regexp for matching lines
	 * @param $extracts Array: extracts to search
	 * @param $linesleft Integer: number of extracts to make (by reference)
	 * @param $contextchars Integer: length of snippet
	 * @param $out Array: map for highlighted snippets (by reference)
	 * @param $offsets Array: map of starting points of snippets (by reference)
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				$begin = $offset;
			} else {
				// centre the match inside the context window
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal: templates, links, HTML-like tags and
	 * bold/italic quote markup.
	 *
	 * @param $text String
	 * @return String
	 */
	function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * Callback to replace a piped link with its caption, except for file
	 * and category links, which are returned whole.
	 *
	 * @param $matches Array
	 * @return String
	 */
	function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple & fast snippet extraction, but gives completely irrelevant
	 * snippets: takes the first $contextlines lines containing any term.
	 *
	 * @param $text String
	 * @param $terms Array (unescaped regex fragments)
	 * @param $contextlines Integer
	 * @param $contextchars Integer
	 * @return String
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = array();
			if ( ! preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

			// Test the group that is actually read.
			if ( isset( $m[3] ) ) {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			} else {
				$post = '';
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2,
				"<span class='searchmatch'>\\1</span>", $line );

			$extract .= "{$line}\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

}

/**
 * Dummy class to be used when non-supported Database engine is present.
 *
 * @todo FIXME: Dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
	// no-op
}