MediaWiki
REL1_20
|
<?php
/**
 * Basic search engine classes: the SearchEngine base class, result-set and
 * result wrappers, and the SearchHighlighter snippet generator.
 *
 * @file
 * @ingroup Search
 */

/**
 * Base class for search back ends.
 *
 * The search methods of this base class are no-ops that return null; the
 * class mainly carries shared state (limit, offset, namespaces) and static
 * helpers for near-match ("go") lookups and namespace handling.
 *
 * @ingroup Search
 */
class SearchEngine {
	/** Maximum number of results; set through setLimitOffset() */
	public $limit = 10;
	/** Index of the first result; set through setLimitOffset() */
	public $offset = 0;
	public $prefix = '';
	public $searchTerms = array();
	/** Namespace indexes to search, or null (see replacePrefixes()) for all */
	public $namespaces = array( NS_MAIN );
	public $showRedirects = false;

	/** Feature name => data, as stored by setFeatureData() */
	protected $features = array();

	/** Database handle given to the constructor, or a DB_SLAVE connection */
	protected $db;

	/**
	 * @param $db mixed Database handle to use; when falsy, wfGetDB( DB_SLAVE ) is used
	 */
	function __construct( $db = null ) {
		if ( $db ) {
			$this->db = $db;
		} else {
			$this->db = wfGetDB( DB_SLAVE );
		}
	}

	/**
	 * Perform a full text search query.
	 * This base implementation does nothing and returns null.
	 *
	 * @param $term String: raw search term
	 * @return null
	 */
	function searchText( $term ) {
		return null;
	}

	/**
	 * Perform a title-only search query.
	 * This base implementation does nothing and returns null.
	 *
	 * @param $term String: raw search term
	 * @return null
	 */
	function searchTitle( $term ) {
		return null;
	}

	/**
	 * @deprecated since 1.18; call supports( 'list-redirects' ) instead
	 * @return Boolean
	 */
	function acceptListRedirects() {
		wfDeprecated( __METHOD__, '1.18' );
		return $this->supports( 'list-redirects' );
	}

	/**
	 * Tell whether this engine supports a named feature.
	 * The base class supports only 'list-redirects'.
	 *
	 * @param $feature String
	 * @return Boolean
	 */
	public function supports( $feature ) {
		switch( $feature ) {
		case 'list-redirects':
			return true;
		case 'title-suffix-filter':
		default:
			return false;
		}
	}

	/**
	 * Store data for a feature; the value is kept in $this->features.
	 *
	 * @param $feature String
	 * @param $data Mixed
	 */
	public function setFeatureData( $feature, $data ) {
		$this->features[$feature] = $data;
	}

	/**
	 * Normalize text through the content language's word segmentation.
	 *
	 * @param $string String
	 * @return String
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		// Some languages such as Chinese require word segmentation
		return $wgContLang->segmentByWord( $string );
	}

	/**
	 * Transform a search term before use. The base class returns it unchanged.
	 *
	 * @param $term String
	 * @return String
	 */
	function transformSearchTerm( $term ) {
		return $term;
	}

	/**
	 * Find a title that nearly matches the search term, then let the
	 * 'SearchGetNearMatchComplete' hook inspect or replace the result.
	 *
	 * @param $searchterm String
	 * @return Title|null
	 */
	public static function getNearMatch( $searchterm ) {
		$title = self::getNearMatchInternal( $searchterm );

		wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
		return $title;
	}

	/**
	 * Same as getNearMatch(), but wrapped in a SearchNearMatchResultSet.
	 *
	 * @param $searchterm String
	 * @return SearchNearMatchResultSet
	 */
	public static function getNearMatchResultSet( $searchterm ) {
		return new SearchNearMatchResultSet( self::getNearMatch( $searchterm ) );
	}

	/**
	 * Do the actual near-match lookup: try the term (and its language
	 * variants) verbatim and in several capitalizations, then fall back to
	 * special handling for IPs, user pages, files, MediaWiki: pages and
	 * quoted terms.
	 *
	 * @param $searchterm String
	 * @return Title|null
	 */
	private static function getNearMatchInternal( $searchterm ) {
		global $wgContLang, $wgEnableSearchContributorsByIP;

		$allSearchTerms = array( $searchterm );

		if ( $wgContLang->hasVariants() ) {
			$allSearchTerms = array_merge( $allSearchTerms, $wgContLang->autoConvertToAllVariants( $searchterm ) );
		}

		$titleResult = null;
		if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) {
			return $titleResult;
		}

		foreach ( $allSearchTerms as $term ) {

			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}

			if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
				return $title;
			}

			# See if it still otherwise has content in some sane sense
			$page = WikiPage::factory( $title );
			if ( $page->hasViewableContent() ) {
				return $title;
			}

			# Now try all lower case (i.e. first letter capitalized)
			$title = Title::newFromText( $wgContLang->lc( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try capitalized string
			$title = Title::newFromText( $wgContLang->ucwords( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try all upper case
			$title = Title::newFromText( $wgContLang->uc( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			# Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
			$title = Title::newFromText( $wgContLang->ucwordbreaks( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		// Cannot be null here: the loop above already returned null for an
		// unparseable $searchterm, which is always the first term tried.
		$title = Title::newFromText( $searchterm );

		# Entering an IP address goes to the contributions page
		if ( $wgEnableSearchContributorsByIP ) {
			if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
				|| User::isIP( trim( $searchterm ) ) ) {
				return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
			}
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $title->getNamespace() == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $title->getNamespace() == NS_FILE ) {
			$image = wfFindFile( $title );
			if ( $image ) {
				return $title;
			}
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$matches = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
			return SearchEngine::getNearMatch( $matches[1] );
		}

		return null;
	}

	/**
	 * Characters allowed in a search term, as the inside of a regex
	 * character class (see filter()).
	 *
	 * @return String
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'.0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the maximum number of results to return and the number to skip.
	 * Both values are cast with intval().
	 *
	 * @param $limit Integer
	 * @param $offset Integer
	 */
	function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set which namespaces the search should include.
	 *
	 * @param $namespaces Array: namespace indexes (null is used by
	 *   replacePrefixes() for the "all:" keyword)
	 */
	function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Parse a leading "all:" keyword or namespace prefix off the query and
	 * update $this->namespaces accordingly.
	 * Runs the 'SearchEngineReplacePrefixesComplete' hook before returning.
	 *
	 * @param $query String
	 * @return String: the query with the recognized prefix removed, or the
	 *   original query if nothing was recognized or nothing would remain
	 */
	function replacePrefixes( $query ) {
		global $wgContLang;

		$parsed = $query;
		if ( strpos( $query, ':' ) === false ) { // nothing to do
			wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
			return $parsed;
		}

		$allkeyword = wfMessage( 'searchall' )->inContentLanguage()->text() . ":";
		if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
			$this->namespaces = null;
			$parsed = substr( $query, strlen( $allkeyword ) );
		} elseif ( strpos( $query, ':' ) !== false ) {
			$prefix = substr( $query, 0, strpos( $query, ':' ) );
			$index = $wgContLang->getNsIndex( $prefix );
			if ( $index !== false ) {
				$this->namespaces = array( $index );
				$parsed = substr( $query, strlen( $prefix ) + 1 );
			}
		}
		if ( trim( $parsed ) == '' ) {
			$parsed = $query; // prefix was the whole query
		}

		wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

		return $parsed;
	}

	/**
	 * List the content-language namespaces with index >= NS_MAIN, then let
	 * the 'SearchableNamespaces' hook adjust the list.
	 *
	 * @return Array: namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;
		$arr = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns >= NS_MAIN ) {
				$arr[$ns] = $name;
			}
		}

		wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
		return $arr;
	}

	/**
	 * Extract the namespaces a user wants searched from their preferences.
	 *
	 * @param $user User
	 * @return Array: namespace indexes
	 */
	public static function userNamespaces( $user ) {
		global $wgSearchEverythingOnlyLoggedIn;

		$searchableNamespaces = SearchEngine::searchableNamespaces();

		// The "search everything" preference overrides the per-namespace
		// options; it may be restricted to logged-in users.
		if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
			if ( $user->getOption( 'searcheverything' ) ) {
				return array_keys( $searchableNamespaces );
			}
		}

		$arr = array();
		foreach ( $searchableNamespaces as $ns => $name ) {
			if ( $user->getOption( 'searchNs' . $ns ) ) {
				$arr[] = $ns;
			}
		}

		return $arr;
	}

	/**
	 * Snippet size settings. The values are hardcoded.
	 *
	 * @return Array: ( number of context lines, number of context chars )
	 */
	public static function userHighlightPrefs() {
		$contextlines = 2; // Hardcode this. Old defaults sucked. :)
		$contextchars = 75; // same as above.... :P
		return array( $contextlines, $contextchars );
	}

	/**
	 * Namespaces enabled in $wgNamespacesToBeSearchedDefault.
	 *
	 * @return Array: namespace indexes
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Turn namespace indexes into formatted names; an empty name is replaced
	 * with the 'blanknamespace' message.
	 *
	 * @param $namespaces Array
	 * @return Array
	 */
	public static function namespacesAsText( $namespaces ) {
		global $wgContLang;

		$formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
		foreach ( $formatted as $key => $ns ) {
			if ( empty( $ns ) ) {
				$formatted[$key] = wfMessage( 'blanknamespace' )->text();
			}
		}
		return $formatted;
	}

	/**
	 * Namespaces enabled in $wgNamespacesToBeSearchedHelp.
	 *
	 * @return Array: namespace indexes
	 */
	public static function helpNamespaces() {
		global $wgNamespacesToBeSearchedHelp;

		return array_keys( $wgNamespacesToBeSearchedHelp, true );
	}

	/**
	 * Replace every character outside legalSearchChars() with a space and
	 * trim the result.
	 *
	 * @param $text String
	 * @return String
	 */
	function filter( $text ) {
		$lc = $this->legalSearchChars();
		return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
	}

	/**
	 * Instantiate the configured search engine: $wgSearchType if set,
	 * otherwise the class named by the slave database's getSearchEngine().
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgSearchType;
		$dbr = null;
		if ( $wgSearchType ) {
			$class = $wgSearchType;
		} else {
			$dbr = wfGetDB( DB_SLAVE );
			$class = $dbr->getSearchEngine();
		}
		$search = new $class( $dbr );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * Create or update the search index record for a page.
	 * No-op in the base class.
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Update only the title of a search index record.
	 * No-op in the base class.
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get the OpenSearch URL template: $wgOpenSearchTemplate if set,
	 * otherwise an api.php opensearch URL over the default namespaces.
	 *
	 * @return String
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgCanonicalServer;
		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		} else {
			$ns = implode( '|', SearchEngine::defaultNamespaces() );
			if ( !$ns ) {
				$ns = "0";
			}
			return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
		}
	}
}

/**
 * Base result set: every accessor returns an "empty" value.
 *
 * @ingroup Search
 */
class SearchResultSet {
	/**
	 * Terms to highlight in the results.
	 *
	 * @return Array
	 */
	function termMatches() {
		return array();
	}

	/** @return Integer */
	function numRows() {
		return 0;
	}

	/**
	 * @return Boolean: always false in the base class
	 */
	function hasResults() {
		return false;
	}

	/**
	 * @return Integer|null: null in the base class
	 */
	function getTotalHits() {
		return null;
	}

	/**
	 * @return Boolean: always false in the base class
	 */
	function hasSuggestion() {
		return false;
	}

	/**
	 * @return String|null: null in the base class
	 */
	function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return String: empty in the base class
	 */
	function getSuggestionSnippet() {
		return '';
	}

	/**
	 * @return String|null: null in the base class
	 */
	function getInfo() {
		return null;
	}

	/**
	 * @return SearchResultSet|null: null in the base class
	 */
	function getInterwikiResults() {
		return null;
	}

	/**
	 * @return Boolean: whether getInterwikiResults() returned something non-null
	 */
	function hasInterwikiResults() {
		return $this->getInterwikiResults() != null;
	}

	/**
	 * Fetch the next result.
	 *
	 * @return SearchResult|false: false when there are no more results
	 *   (always, in the base class)
	 */
	function next() {
		return false;
	}

	/**
	 * Release any resources held by the result set. No-op in the base class.
	 */
	function free() {
		// ...
	}
}

/**
 * Result set backed by a database result object.
 * A result set of boolean false is tolerated by every method.
 *
 * @ingroup Search
 */
class SqlSearchResultSet extends SearchResultSet {

	protected $mResultSet;

	/**
	 * Terms handed to the constructor. This used to be created dynamically
	 * (hence public); it is declared so the class does not depend on
	 * dynamic properties.
	 */
	public $mTerms;

	/**
	 * @param $resultSet mixed Database result object, or false
	 * @param $terms Array: terms returned by termMatches()
	 */
	function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	function termMatches() {
		return $this->mTerms;
	}

	/**
	 * @return Integer|false: false when there is no underlying result set
	 */
	function numRows() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		return $this->mResultSet->numRows();
	}

	/**
	 * @return SearchResult|false
	 */
	function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}

		return SearchResult::newFromRow( $row );
	}

	function free() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$this->mResultSet->free();
	}
}

/**
 * Marker class.
 *
 * @ingroup Search
 */
class SearchResultTooMany {
	# # Some search engines may bail out if too many matches are found
}

/**
 * A single search result: a title plus, when available, its revision
 * and (for NS_FILE titles) its file.
 *
 * @ingroup Search
 */
class SearchResult {

	/** Revision loaded by initFromTitle(), or null */
	public $mRevision = null;
	/** File found by wfFindFile() for NS_FILE titles, or null */
	public $mImage = null;

	/** Title of the result */
	public $mTitle;

	/** Page text, loaded lazily by initText() */
	public $mText;

	/**
	 * Build a result from a title.
	 *
	 * @param $title Title
	 * @return SearchResult
	 */
	public static function newFromTitle( $title ) {
		$result = new self();
		$result->initFromTitle( $title );
		return $result;
	}

	/**
	 * Build a result from a database row carrying page_namespace and
	 * page_title.
	 *
	 * @param $row Object
	 * @return SearchResult
	 */
	public static function newFromRow( $row ) {
		$result = new self();
		$result->initFromRow( $row );
		return $result;
	}

	public function __construct( $row = null ) {
		if ( !is_null( $row ) ) {
			// Backwards compatibility with pre-1.17 callers
			$this->initFromRow( $row );
		}
	}

	/**
	 * Initialize from a database row (page_namespace, page_title).
	 *
	 * @param $row Object
	 */
	protected function initFromRow( $row ) {
		$this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
	}

	/**
	 * Initialize from a title: load the revision (the
	 * 'SearchResultInitFromTitle' hook may choose a revision id) and, for
	 * NS_FILE titles, look up the file.
	 *
	 * @param $title Title|null
	 */
	protected function initFromTitle( $title ) {
		$this->mTitle = $title;
		if ( !is_null( $this->mTitle ) ) {
			$id = false;
			wfRunHooks( 'SearchResultInitFromTitle', array( $title, &$id ) );
			$this->mRevision = Revision::newFromTitle(
				$this->mTitle, $id, Revision::READ_NORMAL );
			if ( $this->mTitle->getNamespace() === NS_FILE ) {
				$this->mImage = wfFindFile( $this->mTitle );
			}
		}
	}

	/**
	 * @return Boolean: true when the result has no title
	 */
	function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * @return Boolean: true when neither a revision nor a file was found
	 */
	function isMissingRevision() {
		return !$this->mRevision && !$this->mImage;
	}

	/**
	 * @return Title
	 */
	function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return null: the base class has no relevance score
	 */
	function getScore() {
		return null;
	}

	/**
	 * Load the revision text into mText on first use; empty string when
	 * there is no revision.
	 */
	protected function initText() {
		if ( !isset( $this->mText ) ) {
			if ( $this->mRevision != null ) {
				$this->mText = $this->mRevision->getText();
			} else { // TODO: can we fetch raw wikitext for commons images?
				$this->mText = '';
			}
		}
	}

	/**
	 * Build a highlighted snippet of the page text, using
	 * SearchHighlighter::highlightText() when $wgAdvancedSearchHighlighting
	 * is set and highlightSimple() otherwise.
	 *
	 * @param $terms Array: terms to highlight
	 * @return String
	 */
	function getTextSnippet( $terms ) {
		global $wgAdvancedSearchHighlighting;
		$this->initText();
		// userHighlightPrefs() takes no parameters; the $wgUser argument
		// formerly passed here was ignored.
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
		$h = new SearchHighlighter();
		if ( $wgAdvancedSearchHighlighting ) {
			return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		} else {
			return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
		}
	}

	/**
	 * @param $terms Array
	 * @return String: empty in the base class
	 */
	function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param $terms Array
	 * @return String: empty in the base class
	 */
	function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return Title|null: null in the base class
	 */
	function getRedirectTitle() {
		return null;
	}

	/**
	 * @return String: empty in the base class
	 */
	function getSectionSnippet() {
		return '';
	}

	/**
	 * @return Title|null: null in the base class
	 */
	function getSectionTitle() {
		return null;
	}

	/**
	 * @return String: timestamp of the revision, else of the file, else ''
	 */
	function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		} elseif ( $this->mImage ) {
			return $this->mImage->getTimestamp();
		}
		return '';
	}

	/**
	 * @return Integer: str_word_count() of the page text
	 */
	function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return Integer: size of the page text in bytes
	 */
	function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return Boolean: always false in the base class
	 */
	function hasRelated() {
		return false;
	}

	/**
	 * @return String: empty in the base class
	 */
	function getInterwikiPrefix() {
		return '';
	}
}

/**
 * A result set holding at most one result: the near match.
 *
 * @ingroup Search
 */
class SearchNearMatchResultSet extends SearchResultSet {
	private $fetched = false;

	/**
	 * The near match, or a falsy value when there is none. This used to be
	 * created dynamically (hence public).
	 */
	public $result;

	/**
	 * @param $match mixed Title if a matching page was found, null otherwise
	 */
	public function __construct( $match ) {
		$this->result = $match;
	}

	/**
	 * Overrides the parent stub, which always returns false. Without this
	 * override numRows() reported 0 even when a match was present.
	 *
	 * @return Boolean
	 */
	public function hasResults() {
		return (bool)$this->result;
	}

	/**
	 * Original (misspelled) name of hasResults(); kept for existing callers.
	 *
	 * @return Boolean
	 */
	public function hasResult() {
		return $this->hasResults();
	}

	/**
	 * @return Integer: 1 when there is a match, 0 otherwise
	 */
	public function numRows() {
		return $this->hasResults() ? 1 : 0;
	}

	/**
	 * Return the single result on the first call, false afterwards.
	 *
	 * @return SearchResult|false
	 */
	public function next() {
		if ( $this->fetched || !$this->result ) {
			return false;
		}
		$this->fetched = true;
		return SearchResult::newFromTitle( $this->result );
	}
}

/**
 * Highlight search terms in page text and build result snippets.
 *
 * @ingroup Search
 */
class SearchHighlighter {
	/** Whether splitAndAdd() passes text through removeWiki() first */
	public $mCleanWikitext = true;

	/**
	 * @param $cleanupWikitext Boolean
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Default implementation of wikitext highlighting.
	 *
	 * Splits the text into plain-text lines and template/image/table/ref
	 * chunks, picks the lines matching the terms, pads them towards a
	 * constant size and wraps matches in <span class='searchmatch'>.
	 *
	 * NOTE(review): the extracted text is concatenated with HTML markup
	 * without being escaped here (an htmlspecialchars pass used to be
	 * commented out at the point where $snippets is reassigned) - confirm
	 * that callers handle escaping.
	 *
	 * @param $text String
	 * @param $terms Array: terms to highlight (used as regex fragments, not quoted)
	 * @param $contextlines Integer
	 * @param $contextchars Integer
	 * @return String
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		global $wgSearchHighlightBoundaries;
		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

		// @todo FIXME: This should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != - 1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, - 1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}
						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( ! $found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
			} else {
				$terms[$index] = $term;
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// With no terms $anyterm is empty; leave contextchars unscaled
		// rather than divide by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( ! $snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array ( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
						&& array_key_exists( $add, $all )
						&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		$snippets = $extended;
		$last = - 1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == - 1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		$processed = array();
		foreach ( $terms as $term ) {
			if ( ! isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and append the non-empty (after trim) ones to
	 * $extracts, keyed by the running sequence number $count.
	 *
	 * A line consisting of just "0" is kept: the emptiness check compares
	 * against '' instead of relying on truthiness.
	 *
	 * @param $extracts Array: index -> $line (modified in place)
	 * @param $count Integer: next sequence number (modified in place)
	 * @param $text String
	 */
	function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * preg_replace_callback() helper: turn a multibyte character into a
	 * character class holding its lower- and upper-case forms; single-byte
	 * matches are returned unchanged.
	 *
	 * @param $matches Array
	 * @return String
	 */
	function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract part of the text from $start to $end (byte offsets), with
	 * both ends adjusted by position().
	 *
	 * @param $text String
	 * @param $start Integer
	 * @param $end Integer
	 * @param $posStart Integer: (out) adjusted start; only written when the
	 *   variable passed in is not null
	 * @param $posEnd Integer: (out) adjusted end; only written when the
	 *   variable passed in is not null
	 * @return String
	 */
	function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a nice place to begin or end a snippet: the first delimiter
	 * within $tolerance bytes of $point, or failing that the next byte at
	 * or after $point that is not a UTF-8 continuation byte.
	 *
	 * @param $text String
	 * @param $point Integer: byte offset
	 * @param $offset Integer: added to the result when a delimiter is found
	 * @return Integer: byte offset, never beyond strlen( $text ) when no
	 *   delimiter is found
	 */
	function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = array();
		if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// A point at or past the end has no byte to inspect; clamp it
			// like the loop below does when it runs off the end.
			if ( $point >= strlen( $text ) ) {
				return strlen( $text );
			}
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= strlen( $text ) ) {
					return strlen( $text );
				}
				$char = ord( $text[$point] );
			}
			return $point;
		}
	}

	/**
	 * Search the extracts for the pattern and add a snippet for each
	 * matching line not already in $out, until $linesleft reaches zero.
	 *
	 * @param $pattern String: regexp
	 * @param $extracts Array: index => line
	 * @param $linesleft Integer: number of snippets still wanted (decremented)
	 * @param $contextchars Integer: length of a snippet
	 * @param $out Array: index => snippet (modified in place)
	 * @param $offsets Array: index => start offset of the snippet (modified in place)
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				$begin = $offset;
			} else {
				// centre the match inside the context window
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal: strips templates, unwraps links (see
	 * linkReplace()), and removes HTML-like tags and quote markup.
	 *
	 * @param $text String
	 * @return String
	 */
	function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * preg_replace_callback() helper for removeWiki(): replace a piped link
	 * with its caption, except for NS_FILE and NS_CATEGORY links, which are
	 * returned whole.
	 *
	 * @param $matches Array
	 * @return String
	 */
	function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple and fast snippet generation: take up to $contextlines lines
	 * matching any term, truncate around the match, HTML-escape the line
	 * and wrap the matches in <span class='searchmatch'>.
	 *
	 * @param $text String
	 * @param $terms Array: terms to highlight (used as regex fragments, not quoted)
	 * @param $contextlines Integer
	 * @param $contextchars Integer
	 * @return String
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$lineno = 0;

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			++$lineno;
			$m = array();
			if ( ! preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2,
				"<span class='searchmatch'>\\1</span>", $line );

			$extract .= "{$line}\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

}

/**
 * Dummy class to be used when non-supported Database engine is present.
 * It adds nothing to SearchEngine, whose search methods return null.
 *
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
	// no-op
}