MediaWiki  REL1_20
SearchEngine.php
Go to the documentation of this file.
00001 <?php
/**
 * Base class for search back-ends.
 *
 * Holds the settings shared by all engines (limit, offset, namespaces and
 * per-feature data) and offers static helpers for "near match" title lookup
 * and for working out which namespaces to search. The search and index
 * update methods defined here are stubs that do nothing.
 */
class SearchEngine {
        /** Maximum number of results; stored as an integer by setLimitOffset() */
        public $limit = 10;
        /** Index of the first result; stored as an integer by setLimitOffset() */
        public $offset = 0;
        /** Not referenced inside this class; presumably used by subclasses - TODO confirm */
        public $prefix = '';
        /** Not referenced inside this class; presumably used by subclasses - TODO confirm */
        public $searchTerms = array();
        /** Namespace indexes to search; null once an "all:" prefix has been parsed */
        public $namespaces = array( NS_MAIN );
        /** Not referenced inside this class; presumably used by subclasses - TODO confirm */
        public $showRedirects = false;

        /** Feature name => data, filled by setFeatureData() */
        protected $features = array();

        /** Database handle set by the constructor */
        protected $db;

        /**
         * @param $db mixed Database handle to use. Any falsy value makes the
         *   engine fetch one through wfGetDB( DB_SLAVE ).
         */
        public function __construct( $db = null ) {
                $this->db = $db ? $db : wfGetDB( DB_SLAVE );
        }

        /**
         * Stub for full-text search. The base class performs no search.
         * Subclasses are expected to override this.
         *
         * @param $term string Search term (unused here)
         * @return null
         */
        public function searchText( $term ) {
                return null;
        }

        /**
         * Stub for title search. The base class performs no search.
         * Subclasses are expected to override this.
         *
         * @param $term string Search term (unused here)
         * @return null
         */
        public function searchTitle( $term ) {
                return null;
        }

        /**
         * Deprecated alias for supports( 'list-redirects' ).
         *
         * @deprecated since 1.18, call supports( 'list-redirects' ) instead
         * @return bool
         */
        public function acceptListRedirects() {
                wfDeprecated( __METHOD__, '1.18' );
                return $this->supports( 'list-redirects' );
        }

        /**
         * Tell whether this engine supports a named feature.
         *
         * Only 'list-redirects' is reported as supported by the base class.
         * 'title-suffix-filter' and every unknown feature yield false. The
         * comparison is deliberately loose, matching switch() semantics.
         *
         * @param $feature string Feature name
         * @return bool
         */
        public function supports( $feature ) {
                return $feature == 'list-redirects';
        }

        /**
         * Store engine-specific data for a feature.
         *
         * @param $feature string Feature name, used as the array key
         * @param $data mixed Value to remember
         */
        public function setFeatureData( $feature, $data ) {
                $this->features[$feature] = $data;
        }

        /**
         * Normalize text by passing it through the content language's
         * segmentByWord(). Some languages, such as Chinese, require word
         * segmentation.
         *
         * @param $string string
         * @return string Whatever $wgContLang->segmentByWord() returns
         */
        public function normalizeText( $string ) {
                global $wgContLang;

                $segmented = $wgContLang->segmentByWord( $string );
                return $segmented;
        }

        /**
         * Hook point for subclasses to rewrite a search term. The base class
         * returns the term untouched.
         *
         * @param $term string
         * @return string
         */
        public function transformSearchTerm( $term ) {
                return $term;
        }

        /**
         * Find a title that closely matches the search term, then let the
         * 'SearchGetNearMatchComplete' hook inspect or replace the result.
         *
         * @param $searchterm string
         * @return Title|null
         */
        public static function getNearMatch( $searchterm ) {
                $match = self::getNearMatchInternal( $searchterm );

                wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$match ) );
                return $match;
        }

        /**
         * Same lookup as getNearMatch(), wrapped in a SearchNearMatchResultSet.
         *
         * @param $searchterm string
         * @return SearchNearMatchResultSet
         */
        public static function getNearMatchResultSet( $searchterm ) {
                $match = self::getNearMatch( $searchterm );
                return new SearchNearMatchResultSet( $match );
        }

        /**
         * Work horse behind getNearMatch().
         *
         * Tries the term (and its language variants, if the content language
         * has any) as an exact title, then in several capitalizations, then
         * offers it to the 'SearchGetNearMatch' hook. When nothing matched,
         * a few special cases are handled: IP addresses, user pages, files,
         * MediaWiki-namespace pages and terms wrapped in double quotes.
         *
         * @param $searchterm string
         * @return Title|null
         */
        private static function getNearMatchInternal( $searchterm ) {
                global $wgContLang, $wgEnableSearchContributorsByIP;

                // The raw term always comes first; language variants follow it
                $candidates = array( $searchterm );
                if ( $wgContLang->hasVariants() ) {
                        $candidates = array_merge(
                                $candidates,
                                $wgContLang->autoConvertToAllVariants( $searchterm )
                        );
                }

                // A hook returning false short-circuits the whole lookup
                $hookResult = null;
                if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $candidates, &$hookResult ) ) ) {
                        return $hookResult;
                }

                // Language methods tried in this order: all lower case (i.e. first
                // letter capitalized), capitalized words, all upper case and
                // Word-Caps-Breaking-At-Word-Breaks (for hyphenated names etc.)
                $caseTransforms = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

                foreach ( $candidates as $candidate ) {
                        $exact = Title::newFromText( $candidate );
                        if ( is_null( $exact ) ) {
                                return null;
                        }

                        // Exact match? No need to look further.
                        if ( $exact->isSpecialPage() || $exact->isExternal() || $exact->exists() ) {
                                return $exact;
                        }

                        // See if it otherwise has content in some sane sense
                        if ( WikiPage::factory( $exact )->hasViewableContent() ) {
                                return $exact;
                        }

                        foreach ( $caseTransforms as $transform ) {
                                $recased = Title::newFromText( $wgContLang->$transform( $candidate ) );
                                if ( $recased && $recased->exists() ) {
                                        return $recased;
                                }
                        }

                        // Give hooks a chance at better match variants
                        $hookTitle = null;
                        if ( !wfRunHooks( 'SearchGetNearMatch', array( $candidate, &$hookTitle ) ) ) {
                                return $hookTitle;
                        }
                }

                $title = Title::newFromText( $searchterm );
                $ns = $title->getNamespace();

                // Entering an IP address goes to the contributions page
                if ( $wgEnableSearchContributorsByIP
                        && ( ( $ns == NS_USER && User::isIP( $title->getText() ) )
                                || User::isIP( trim( $searchterm ) ) )
                ) {
                        return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
                }

                // Entering a user goes to the user page whether it's there or not
                if ( $ns == NS_USER ) {
                        return $title;
                }

                // Go to images that exist even if there's no local page.
                // There may have been a funny upload, or it may be on a shared
                // file repository such as Wikimedia Commons.
                if ( $ns == NS_FILE && wfFindFile( $title ) ) {
                        return $title;
                }

                // MediaWiki namespace? Page may be "implied" if not customized.
                // Just return it, with caps forced as the message system likes it.
                if ( $ns == NS_MEDIAWIKI ) {
                        return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
                }

                // Quoted term? Try again without the quotes...
                $quoted = array();
                if ( preg_match( '/^"([^"]+)"$/', $searchterm, $quoted ) ) {
                        return self::getNearMatch( $quoted[1] );
                }

                return null;
        }

        /**
         * Characters kept by filter(), written as the inside of a regular
         * expression character class.
         *
         * @return string
         */
        public static function legalSearchChars() {
                return "A-Za-z_'.0-9\\x80-\\xFF\\-";
        }

        /**
         * Set the result window. Both values are converted to integers.
         *
         * @param $limit int Maximum number of results
         * @param $offset int Index of the first result
         */
        public function setLimitOffset( $limit, $offset = 0 ) {
                $this->limit = (int)$limit;
                $this->offset = (int)$offset;
        }

        /**
         * Set which namespaces will be searched. The value is stored as given.
         *
         * @param $namespaces array|null
         */
        public function setNamespaces( $namespaces ) {
                $this->namespaces = $namespaces;
        }

        /**
         * Parse a leading "all:" or namespace prefix off the query.
         *
         * The "all" keyword comes from the 'searchall' message in the content
         * language and sets $this->namespaces to null. A recognised namespace
         * prefix restricts $this->namespaces to that one namespace. If removing
         * the prefix would leave only whitespace, the original query is kept.
         * The 'SearchEngineReplacePrefixesComplete' hook always runs last and
         * may alter the returned string.
         *
         * @param $query string
         * @return string The query with any recognised prefix removed
         */
        public function replacePrefixes( $query ) {
                global $wgContLang;

                $parsed = $query;
                $colonPos = strpos( $query, ':' );

                // Without a colon there is no prefix to look for
                if ( $colonPos !== false ) {
                        $allkeyword = wfMessage( 'searchall' )->inContentLanguage()->text() . ":";
                        $allLength = strlen( $allkeyword );

                        if ( strncmp( $query, $allkeyword, $allLength ) == 0 ) {
                                $this->namespaces = null;
                                $parsed = substr( $query, $allLength );
                        } else {
                                $nsPrefix = substr( $query, 0, $colonPos );
                                $index = $wgContLang->getNsIndex( $nsPrefix );
                                if ( $index !== false ) {
                                        $this->namespaces = array( $index );
                                        $parsed = substr( $query, strlen( $nsPrefix ) + 1 );
                                }
                        }

                        if ( trim( $parsed ) == '' ) {
                                // prefix was the whole query
                                $parsed = $query;
                        }
                }

                wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

                return $parsed;
        }

        /**
         * List the namespaces that can be searched: every content-language
         * namespace whose index is at least NS_MAIN, as adjusted by the
         * 'SearchableNamespaces' hook.
         *
         * @return array Namespace index => namespace name
         */
        public static function searchableNamespaces() {
                global $wgContLang;

                $searchable = array();
                foreach ( $wgContLang->getNamespaces() as $index => $name ) {
                        if ( $index < NS_MAIN ) {
                                continue;
                        }
                        $searchable[$index] = $name;
                }

                wfRunHooks( 'SearchableNamespaces', array( &$searchable ) );
                return $searchable;
        }

        /**
         * Work out the namespaces a user wants to search from their options.
         *
         * The 'searcheverything' option overrides the per-namespace
         * 'searchNs<index>' options. It is honoured only when
         * $wgSearchEverythingOnlyLoggedIn is off or the user is logged in.
         *
         * @param $user User Object providing isLoggedIn() and getOption()
         * @return array List of namespace indexes
         */
        public static function userNamespaces( $user ) {
                global $wgSearchEverythingOnlyLoggedIn;

                $searchable = self::searchableNamespaces();

                $mayUseEverything = !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn();
                if ( $mayUseEverything && $user->getOption( 'searcheverything' ) ) {
                        return array_keys( $searchable );
                }

                $selected = array();
                foreach ( array_keys( $searchable ) as $index ) {
                        if ( $user->getOption( 'searchNs' . $index ) ) {
                                $selected[] = $index;
                        }
                }

                return $selected;
        }

        /**
         * Highlighting settings. Both values are hard-coded.
         *
         * @return array Two elements: context lines (2) and context chars (75)
         */
        public static function userHighlightPrefs() {
                return array( 2, 75 );
        }

        /**
         * Namespace indexes enabled in $wgNamespacesToBeSearchedDefault.
         *
         * @return array
         */
        public static function defaultNamespaces() {
                global $wgNamespacesToBeSearchedDefault;

                $enabled = array_keys( $wgNamespacesToBeSearchedDefault, true );
                return $enabled;
        }

        /**
         * Turn namespace indexes into formatted namespace names. An empty name
         * is replaced by the 'blanknamespace' message.
         *
         * @param $namespaces array Namespace indexes
         * @return array Names, with the keys of $namespaces preserved
         */
        public static function namespacesAsText( $namespaces ) {
                global $wgContLang;

                $names = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
                foreach ( $names as $key => $name ) {
                        if ( !$name ) {
                                $names[$key] = wfMessage( 'blanknamespace' )->text();
                        }
                }
                return $names;
        }

        /**
         * Namespace indexes enabled in $wgNamespacesToBeSearchedHelp.
         *
         * @return array
         */
        public static function helpNamespaces() {
                global $wgNamespacesToBeSearchedHelp;

                $enabled = array_keys( $wgNamespacesToBeSearchedHelp, true );
                return $enabled;
        }

        /**
         * Replace every character outside legalSearchChars() with a space and
         * trim the result.
         *
         * @param $text string
         * @return string
         */
        public function filter( $text ) {
                $pattern = '/[^' . $this->legalSearchChars() . ']/';
                return trim( preg_replace( $pattern, " ", $text ) );
        }

        /**
         * Instantiate the configured search engine.
         *
         * Uses the class named by $wgSearchType when set. Otherwise the class
         * is obtained from the DB_SLAVE connection's getSearchEngine(), and
         * that connection is handed to the constructor. The new engine's limit
         * and offset are both reset to 0.
         *
         * @return SearchEngine
         */
        public static function create() {
                global $wgSearchType;

                $class = $wgSearchType;
                $dbr = null;
                if ( !$class ) {
                        $dbr = wfGetDB( DB_SLAVE );
                        $class = $dbr->getSearchEngine();
                }

                $engine = new $class( $dbr );
                $engine->setLimitOffset( 0, 0 );
                return $engine;
        }

        /**
         * Stub for updating the index entry of a page. Does nothing here.
         *
         * @param $id mixed Unused by the base class
         * @param $title mixed Unused by the base class
         * @param $text mixed Unused by the base class
         */
        public function update( $id, $title, $text ) {
                // no-op
        }

        /**
         * Stub for updating only the title part of an index entry. Does
         * nothing here.
         *
         * @param $id mixed Unused by the base class
         * @param $title mixed Unused by the base class
         */
        public function updateTitle( $id, $title ) {
                // no-op
        }

        /**
         * Return the OpenSearch URL template: $wgOpenSearchTemplate when set,
         * otherwise an api.php "opensearch" URL restricted to the default
         * namespaces (namespace 0 when that list is empty).
         *
         * @return string
         */
        public static function getOpenSearchTemplate() {
                global $wgOpenSearchTemplate, $wgCanonicalServer;

                if ( $wgOpenSearchTemplate ) {
                        return $wgOpenSearchTemplate;
                }

                $ns = implode( '|', self::defaultNamespaces() );
                if ( !$ns ) {
                        $ns = "0";
                }
                return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
        }
}
00509 
/**
 * Base class for a set of search results.
 *
 * Every method here is a stub describing an empty result set. Concrete
 * result sets override the ones they can answer.
 */
class SearchResultSet {
        /**
         * Terms matched by the search. The base implementation has none.
         *
         * @return array Always an empty array here
         */
        public function termMatches() {
                return array();
        }

        /**
         * @return int Number of rows in this set; always 0 here
         */
        public function numRows() {
                return 0;
        }

        /**
         * @return bool Whether the set has any results; always false here
         */
        public function hasResults() {
                return false;
        }

        /**
         * @return null The base implementation reports no total hit count
         */
        public function getTotalHits() {
                return null;
        }

        /**
         * @return bool Whether a query suggestion exists; always false here
         */
        public function hasSuggestion() {
                return false;
        }

        /**
         * @return null The base implementation has no suggested query
         */
        public function getSuggestionQuery() {
                return null;
        }

        /**
         * @return string Snippet for the suggestion; always empty here
         */
        public function getSuggestionSnippet() {
                return '';
        }

        /**
         * @return null The base implementation carries no extra information
         */
        public function getInfo() {
                return null;
        }

        /**
         * @return null The base implementation has no interwiki results
         */
        public function getInterwikiResults() {
                return null;
        }

        /**
         * Whether getInterwikiResults() returned something that is not
         * loosely equal to null.
         *
         * @return bool
         */
        public function hasInterwikiResults() {
                $interwiki = $this->getInterwikiResults();
                return $interwiki != null;
        }

        /**
         * Fetch the next result.
         *
         * @return bool Always false here, as there is nothing to iterate
         */
        public function next() {
                return false;
        }

        /**
         * Release resources held by the result set. Nothing to do here.
         */
        public function free() {
                // ...
        }
}
00622 
/**
 * Result set backed by a database result wrapper.
 *
 * The wrapped result may be boolean false, in which case every method
 * short-circuits and returns false.
 */
class SqlSearchResultSet extends SearchResultSet {

        /** Database result wrapper, or false when there is none */
        protected $mResultSet;

        /**
         * @param $resultSet mixed Object offering numRows(), fetchObject() and
         *   free(), or false
         * @param $terms mixed Value handed back by termMatches(); kept in the
         *   dynamically created property $mTerms
         */
        public function __construct( $resultSet, $terms ) {
                $this->mResultSet = $resultSet;
                $this->mTerms = $terms;
        }

        /**
         * @return mixed The terms given to the constructor
         */
        public function termMatches() {
                return $this->mTerms;
        }

        /**
         * @return int|bool Row count of the wrapped result, or false when
         *   there is no result
         */
        public function numRows() {
                return ( $this->mResultSet === false )
                        ? false
                        : $this->mResultSet->numRows();
        }

        /**
         * Fetch the next row and wrap it in a SearchResult.
         *
         * @return SearchResult|bool False when there is no result or no
         *   further row
         */
        public function next() {
                if ( $this->mResultSet === false ) {
                        return false;
                }

                $row = $this->mResultSet->fetchObject();
                return ( $row === false ) ? false : SearchResult::newFromRow( $row );
        }

        /**
         * Free the wrapped result.
         *
         * @return bool|null False when there is no result to free, otherwise
         *   nothing
         */
        public function free() {
                if ( $this->mResultSet === false ) {
                        return false;
                }

                $this->mResultSet->free();
        }
}
00664 
/**
 * Marker class with no members.
 *
 * Some search engines may bail out if too many matches are found.
 */
class SearchResultTooMany {
}
00671 
00672 
/**
 * A single search result: a title plus, when available, its revision and
 * (for the file namespace) its file object.
 *
 * The snippet, redirect, section and score accessors are stubs here.
 */
class SearchResult {

        /** Revision loaded for the title by initFromTitle(), or null */
        public $mRevision = null;
        /** File found for an NS_FILE title by initFromTitle(), or null */
        public $mImage = null;

        /** Title of the result; null marks a broken result */
        public $mTitle;

        /** Text of the revision, loaded lazily by initText() */
        public $mText;

        /**
         * Build a result from a title.
         *
         * @param $title Title|null
         * @return SearchResult
         */
        public static function newFromTitle( $title ) {
                $instance = new self();
                $instance->initFromTitle( $title );
                return $instance;
        }

        /**
         * Build a result from a database row.
         *
         * @param $row object Row with page_namespace and page_title fields
         * @return SearchResult
         */
        public static function newFromRow( $row ) {
                $instance = new self();
                $instance->initFromRow( $row );
                return $instance;
        }

        /**
         * @param $row object|null When given, the result is initialised from
         *   this row. Kept for backwards compatibility with pre-1.17 callers.
         */
        public function __construct( $row = null ) {
                if ( is_null( $row ) ) {
                        return;
                }
                // Backwards compatibility with pre-1.17 callers
                $this->initFromRow( $row );
        }

        /**
         * Initialise from a database row by building its Title.
         *
         * @param $row object Row with page_namespace and page_title fields
         */
        protected function initFromRow( $row ) {
                $title = Title::makeTitle( $row->page_namespace, $row->page_title );
                $this->initFromTitle( $title );
        }

        /**
         * Initialise from a title.
         *
         * For a non-null title this loads the revision, using the ID that the
         * 'SearchResultInitFromTitle' hook may set (false by default), and
         * looks up the file when the title is in NS_FILE.
         *
         * @param $title Title|null
         */
        protected function initFromTitle( $title ) {
                $this->mTitle = $title;
                if ( is_null( $this->mTitle ) ) {
                        return;
                }

                $id = false;
                wfRunHooks( 'SearchResultInitFromTitle', array( $title, &$id ) );
                $this->mRevision = Revision::newFromTitle(
                        $this->mTitle, $id, Revision::READ_NORMAL );

                if ( $this->mTitle->getNamespace() === NS_FILE ) {
                        $this->mImage = wfFindFile( $this->mTitle );
                }
        }

        /**
         * @return bool True when the result has no title
         */
        public function isBrokenTitle() {
                return is_null( $this->mTitle );
        }

        /**
         * @return bool True when neither a revision nor a file is available
         */
        public function isMissingRevision() {
                return !( $this->mRevision || $this->mImage );
        }

        /**
         * @return Title|null
         */
        public function getTitle() {
                return $this->mTitle;
        }

        /**
         * @return null The base implementation provides no score
         */
        public function getScore() {
                return null;
        }

        /**
         * Load the revision text into $mText once. Without a revision the
         * text becomes the empty string.
         */
        protected function initText() {
                if ( isset( $this->mText ) ) {
                        return;
                }

                // TODO: can we fetch raw wikitext for commons images?
                $this->mText = ( $this->mRevision != null )
                        ? $this->mRevision->getText()
                        : '';
        }

        /**
         * Build a highlighted text snippet.
         *
         * Uses SearchHighlighter::highlightText() when
         * $wgAdvancedSearchHighlighting is on, and highlightSimple()
         * otherwise.
         *
         * @param $terms array Terms to highlight
         * @return string Whatever the highlighter returns
         */
        public function getTextSnippet( $terms ) {
                global $wgAdvancedSearchHighlighting;

                $this->initText();
                list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();

                $highlighter = new SearchHighlighter();
                if ( $wgAdvancedSearchHighlighting ) {
                        return $highlighter->highlightText( $this->mText, $terms, $contextlines, $contextchars );
                }
                return $highlighter->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
        }

        /**
         * @param $terms array Unused by the base class
         * @return string Always empty here
         */
        public function getTitleSnippet( $terms ) {
                return '';
        }

        /**
         * @param $terms array Unused by the base class
         * @return string Always empty here
         */
        public function getRedirectSnippet( $terms ) {
                return '';
        }

        /**
         * @return null The base implementation has no redirect title
         */
        public function getRedirectTitle() {
                return null;
        }

        /**
         * @return string Always empty here
         */
        public function getSectionSnippet() {
                return '';
        }

        /**
         * @return null The base implementation has no section title
         */
        public function getSectionTitle() {
                return null;
        }

        /**
         * @return string Timestamp of the revision if there is one, else of
         *   the file, else the empty string
         */
        public function getTimestamp() {
                if ( $this->mRevision ) {
                        return $this->mRevision->getTimestamp();
                }
                if ( $this->mImage ) {
                        return $this->mImage->getTimestamp();
                }
                return '';
        }

        /**
         * @return int Result of str_word_count() on the text
         */
        public function getWordCount() {
                $this->initText();
                $text = $this->mText;
                return str_word_count( $text );
        }

        /**
         * @return int Length of the text in bytes
         */
        public function getByteSize() {
                $this->initText();
                $text = $this->mText;
                return strlen( $text );
        }

        /**
         * @return bool Always false here
         */
        public function hasRelated() {
                return false;
        }

        /**
         * @return string Always empty here
         */
        public function getInterwikiPrefix() {
                return '';
        }
}
/**
 * Result set holding at most one result: the near-match title handed to the
 * constructor.
 */
class SearchNearMatchResultSet extends SearchResultSet {
        /** Whether next() has already handed out the single result */
        private $fetched = false;

        /**
         * @param $match Title|null Title to wrap; any falsy value means
         *   "no match". Stored in the dynamically created property $result.
         */
        public function __construct( $match ) {
                $this->result = $match;
        }

        /**
         * Override of SearchResultSet::hasResults().
         *
         * The class previously only defined hasResult() (singular), so calls
         * to hasResults() reached the parent stub, which always returns false.
         *
         * @return bool True when a match was supplied
         */
        public function hasResults() {
                return (bool)$this->result;
        }

        /**
         * Alias of hasResults(), kept for existing callers of the old name.
         *
         * @return bool
         */
        public function hasResult() {
                return $this->hasResults();
        }

        /**
         * @return int 1 when a match was supplied, otherwise 0
         */
        public function numRows() {
                return $this->hasResults() ? 1 : 0;
        }

        /**
         * Hand out the match once, wrapped in a SearchResult.
         *
         * @return SearchResult|bool False when there is no match or it has
         *   already been fetched
         */
        public function next() {
                if ( $this->fetched || !$this->result ) {
                        return false;
                }
                $this->fetched = true;
                return SearchResult::newFromTitle( $this->result );
        }
}
00920 
00926 class SearchHighlighter {
00927         var $mCleanWikitext = true;
00928 
00929         function __construct( $cleanupWikitext = true ) {
00930                 $this->mCleanWikitext = $cleanupWikitext;
00931         }
00932 
00942         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
00943                 global $wgContLang;
00944                 global $wgSearchHighlightBoundaries;
00945                 $fname = __METHOD__;
00946 
00947                 if ( $text == '' )
00948                         return '';
00949 
00950                 // spli text into text + templates/links/tables
00951                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
00952                 // first capture group is for detecting nested templates/links/tables/references
00953                 $endPatterns = array(
00954                         1 => '/(\{\{)|(\}\})/', // template
00955                         2 => '/(\[\[)|(\]\])/', // image
00956                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table
00957 
00958                 // @todo FIXME: This should prolly be a hook or something
00959                 if ( function_exists( 'wfCite' ) ) {
00960                         $spat .= '|(<ref>)'; // references via cite extension
00961                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
00962                 }
00963                 $spat .= '/';
00964                 $textExt = array(); // text extracts
00965                 $otherExt = array();  // other extracts
00966                 wfProfileIn( "$fname-split" );
00967                 $start = 0;
00968                 $textLen = strlen( $text );
00969                 $count = 0; // sequence number to maintain ordering
00970                 while ( $start < $textLen ) {
00971                         // find start of template/image/table
00972                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
00973                                 $epat = '';
00974                                 foreach ( $matches as $key => $val ) {
00975                                         if ( $key > 0 && $val[1] != - 1 ) {
00976                                                 if ( $key == 2 ) {
00977                                                         // see if this is an image link
00978                                                         $ns = substr( $val[0], 2, - 1 );
00979                                                         if ( $wgContLang->getNsIndex( $ns ) != NS_FILE )
00980                                                                 break;
00981 
00982                                                 }
00983                                                 $epat = $endPatterns[$key];
00984                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
00985                                                 $start = $val[1];
00986                                                 break;
00987                                         }
00988                                 }
00989                                 if ( $epat ) {
00990                                         // find end (and detect any nested elements)
00991                                         $level = 0;
00992                                         $offset = $start + 1;
00993                                         $found = false;
00994                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
00995                                                 if ( array_key_exists( 2, $endMatches ) ) {
00996                                                         // found end
00997                                                         if ( $level == 0 ) {
00998                                                                 $len = strlen( $endMatches[2][0] );
00999                                                                 $off = $endMatches[2][1];
01000                                                                 $this->splitAndAdd( $otherExt, $count,
01001                                                                         substr( $text, $start, $off + $len  - $start ) );
01002                                                                 $start = $off + $len;
01003                                                                 $found = true;
01004                                                                 break;
01005                                                         } else {
01006                                                                 // end of nested element
01007                                                                 $level -= 1;
01008                                                         }
01009                                                 } else {
01010                                                         // nested
01011                                                         $level += 1;
01012                                                 }
01013                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
01014                                         }
01015                                         if ( ! $found ) {
01016                                                 // couldn't find appropriate closing tag, skip
01017                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
01018                                                 $start += strlen( $matches[0][0] );
01019                                         }
01020                                         continue;
01021                                 }
01022                         }
01023                         // else: add as text extract
01024                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
01025                         break;
01026                 }
01027 
01028                 $all = $textExt + $otherExt; // these have disjunct key sets
01029 
01030                 wfProfileOut( "$fname-split" );
01031 
01032                 // prepare regexps
01033                 foreach ( $terms as $index => $term ) {
01034                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
01035                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
01036                                 $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
01037                         } else {
01038                                 $terms[$index] = $term;
01039                         }
01040                 }
01041                 $anyterm = implode( '|', $terms );
01042                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
01043 
01044                 // @todo FIXME: A hack to scale contextchars, a correct solution
01045                 // would be to have contextchars actually be char and not byte
01046                 // length, and do proper utf-8 substrings and lengths everywhere,
01047                 // but PHP is making that very hard and unclean to implement :(
01048                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
01049                 $contextchars = intval( $contextchars * $scale );
01050 
01051                 $patPre = "(^|$wgSearchHighlightBoundaries)";
01052                 $patPost = "($wgSearchHighlightBoundaries|$)";
01053 
01054                 $pat1 = "/(" . $phrase . ")/ui";
01055                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
01056 
01057                 wfProfileIn( "$fname-extract" );
01058 
01059                 $left = $contextlines;
01060 
01061                 $snippets = array();
01062                 $offsets = array();
01063 
01064                 // show beginning only if it contains all words
01065                 $first = 0;
01066                 $firstText = '';
01067                 foreach ( $textExt as $index => $line ) {
01068                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
01069                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
01070                                 $first = $index;
01071                                 break;
01072                         }
01073                 }
01074                 if ( $firstText ) {
01075                         $succ = true;
01076                         // check if first text contains all terms
01077                         foreach ( $terms as $term ) {
01078                                 if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
01079                                         $succ = false;
01080                                         break;
01081                                 }
01082                         }
01083                         if ( $succ ) {
01084                                 $snippets[$first] = $firstText;
01085                                 $offsets[$first] = 0;
01086                         }
01087                 }
01088                 if ( ! $snippets ) {
01089                         // match whole query on text
01090                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
01091                         // match whole query on templates/tables/images
01092                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
01093                         // match any words on text
01094                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
01095                         // match any words on templates/tables/images
01096                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
01097 
01098                         ksort( $snippets );
01099                 }
01100 
01101                 // add extra chars to each snippet to make snippets constant size
01102                 $extended = array();
01103                 if ( count( $snippets ) == 0 ) {
01104                         // couldn't find the target words, just show beginning of article
01105                         if ( array_key_exists( $first, $all ) ) {
01106                                 $targetchars = $contextchars * $contextlines;
01107                                 $snippets[$first] = '';
01108                                 $offsets[$first] = 0;
01109                         }
01110                 } else {
01111                         // if begin of the article contains the whole phrase, show only that !!
01112                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
01113                                 && $offsets[$first] < $contextchars * 2 ) {
01114                                 $snippets = array ( $first => $snippets[$first] );
01115                         }
01116 
01117                         // calc by how much to extend existing snippets
01118                         $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
01119                 }
01120 
01121                 foreach ( $snippets as $index => $line ) {
01122                         $extended[$index] = $line;
01123                         $len = strlen( $line );
01124                         if ( $len < $targetchars - 20 ) {
01125                                 // complete this line
01126                                 if ( $len < strlen( $all[$index] ) ) {
01127                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
01128                                         $len = strlen( $extended[$index] );
01129                                 }
01130 
01131                                 // add more lines
01132                                 $add = $index + 1;
01133                                 while ( $len < $targetchars - 20
01134                                            && array_key_exists( $add, $all )
01135                                            && !array_key_exists( $add, $snippets ) ) {
01136                                         $offsets[$add] = 0;
01137                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
01138                                         $extended[$add] = $tt;
01139                                         $len += strlen( $tt );
01140                                         $add++;
01141                                 }
01142                         }
01143                 }
01144 
01145                 // $snippets = array_map('htmlspecialchars', $extended);
01146                 $snippets = $extended;
01147                 $last = - 1;
01148                 $extract = '';
01149                 foreach ( $snippets as $index => $line ) {
01150                         if ( $last == - 1 )
01151                                 $extract .= $line; // first line
01152                         elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) )
01153                                 $extract .= " " . $line; // continous lines
01154                         else
01155                                 $extract .= '<b> ... </b>' . $line;
01156 
01157                         $last = $index;
01158                 }
01159                 if ( $extract )
01160                         $extract .= '<b> ... </b>';
01161 
01162                 $processed = array();
01163                 foreach ( $terms as $term ) {
01164                         if ( ! isset( $processed[$term] ) ) {
01165                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
01166                                 $extract = preg_replace( $pat3,
01167                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
01168                                 $processed[$term] = true;
01169                         }
01170                 }
01171 
01172                 wfProfileOut( "$fname-extract" );
01173 
01174                 return $extract;
01175         }
01176 
/**
 * Split a text into lines and append every non-blank line to a list.
 *
 * When $this->mCleanWikitext is set the text is first passed through
 * removeWiki(); otherwise it is split as-is.
 *
 * @param $extracts Array: list receiving the trimmed lines (by reference)
 * @param $count Integer: next index to write in $extracts; incremented
 *   once for every line that is stored (by reference)
 * @param $text String: text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
    $source = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
    foreach ( explode( "\n", $source ) as $line ) {
        $trimmed = trim( $line );
        // Compare against '' explicitly: a plain truthiness test would
        // also throw away a line that consists solely of "0".
        if ( $trimmed !== '' ) {
            $extracts[$count++] = $trimmed;
        }
    }
}
01192 
/**
 * Turn a matched string into a two-case character class.
 *
 * Takes a match array (presumably from preg_replace_callback() -- confirm
 * against the caller). A whole match longer than one byte is replaced by
 * "[" + content-language lower case + content-language upper case + "]";
 * a match of at most one byte is returned untouched.
 *
 * @param $matches Array: match array, only index 0 is used
 * @return String
 */
function caseCallback( $matches ) {
    global $wgContLang;

    $hit = $matches[0];
    if ( strlen( $hit ) <= 1 ) {
        return $hit;
    }

    $lower = $wgContLang->lc( $hit );
    $upper = $wgContLang->uc( $hit );
    return '[' . $lower . $upper . ']';
}
01207 
/**
 * Cut the part of $text lying between two approximate byte offsets.
 *
 * Both offsets are adjusted through position(): a non-zero $start is moved
 * (with an offset of 1), and an $end that lies inside the text is moved as
 * well; an $end at or beyond the text length is clamped to that length.
 *
 * @param $text String
 * @param $start Integer: approximate start offset
 * @param $end Integer: approximate end offset
 * @param $posStart Integer: (out) receives the adjusted start, but only
 *   when the variable passed in is not null
 * @param $posEnd Integer: (out) receives the adjusted end, but only when
 *   the variable passed in is not null
 * @return String: the extracted part, or '' when the adjusted end does not
 *   lie after the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
    $length = strlen( $text );

    if ( $start != 0 ) {
        $start = $this->position( $text, $start, 1 );
    }
    $end = ( $end >= $length ) ? $length : $this->position( $text, $end );

    // Report the adjusted boundaries only to callers that passed a value in.
    if ( $posStart !== null ) {
        $posStart = $start;
    }
    if ( $posEnd !== null ) {
        $posEnd = $end;
    }

    return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
01241 
/**
 * Find a suitable cut position near a byte offset.
 *
 * A window of up to 10 bytes on either side of $point is searched for a
 * space or punctuation character; when one exists, the position of the
 * FIRST such character in the window (plus $offset) is returned. Otherwise
 * $point itself is returned, advanced past any UTF-8 continuation bytes
 * (0x80-0xBF) so that a multi-byte character is never split.
 *
 * @param $text String
 * @param $point Integer: approximate byte offset
 * @param $offset Integer: added to the result when a break character was
 *   found (not applied on the fallback path)
 * @return Integer: byte offset, never beyond strlen( $text ) on the
 *   fallback path
 */
function position( $text, $point, $offset = 0 ) {
    $tolerance = 10;
    $length = strlen( $text );
    $s = max( 0, $point - $tolerance );
    $l = min( $length, $point + $tolerance ) - $s;
    $m = array();
    if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
        return $m[0][1] + $s + $offset;
    }

    // No break character nearby. A point at or past the end of the text has
    // no byte to inspect; clamp instead of reading an invalid string offset.
    if ( $point >= $length ) {
        return $length;
    }

    // Make sure the point is on a valid first UTF-8 byte.
    $char = ord( $text[$point] );
    while ( $char >= 0x80 && $char < 0xc0 ) {
        // skip trailing (continuation) bytes
        $point++;
        if ( $point >= $length ) {
            return $length;
        }
        $char = ord( $text[$point] );
    }
    return $point;
}
01271 
/**
 * Collect snippets from the lines that match a pattern.
 *
 * Lines are visited in order; a line that already has an entry in $out is
 * skipped. For the first match on each remaining line a window of
 * $contextchars bytes is chosen around the match and cut out via extract().
 * Processing stops as soon as $linesleft reaches zero.
 *
 * @param $pattern String: regular expression
 * @param $extracts Array: line index => line text
 * @param $linesleft Integer: number of snippets still wanted; decremented
 *   for every snippet produced (by reference)
 * @param $contextchars Integer: snippet length in bytes (by reference)
 * @param $out Array: line index => snippet, filled in (by reference)
 * @param $offsets Array: line index => start offset of the snippet within
 *   its line, filled in (by reference)
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
    if ( $linesleft == 0 ) {
        // nothing to do
        return;
    }

    foreach ( $extracts as $lineIndex => $lineText ) {
        $hit = array();
        // Skip lines that already carry a snippet, then lines without a match.
        if ( array_key_exists( $lineIndex, $out )
            || !preg_match( $pattern, $lineText, $hit, PREG_OFFSET_CAPTURE )
        ) {
            continue;
        }

        $matchPos = $hit[0][1];
        $matchLen = strlen( $hit[0][0] );

        if ( $matchPos + $matchLen < $contextchars ) {
            // The match ends inside the first window: start at the line start.
            $from = 0;
        } elseif ( $matchLen > $contextchars ) {
            // The match is longer than the window: start at the match.
            $from = $matchPos;
        } else {
            // Centre the window on the match.
            $from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
        }

        // basic snippet from this line; extract() reports the adjusted start
        $snippetStart = $from;
        $out[$lineIndex] = $this->extract( $lineText, $from, $from + $contextchars, $snippetStart );
        $offsets[$lineIndex] = $snippetStart;

        if ( --$linesleft == 0 ) {
            return;
        }
    }
}
01314 
/**
 * Strip a basic set of wiki markup from a text.
 *
 * The replacements are applied one after another, in this order:
 * unpiped templates are dropped, piped templates keep what follows the
 * first pipe, unpiped links keep their target, piped links go through
 * linkReplace(), then tags and quote-based bold/italic markers are removed.
 *
 * @param $text String
 * @return String
 */
function removeWiki( $text ) {
    $profileSection = __METHOD__;
    wfProfileIn( $profileSection );

    // pattern => replacement, applied sequentially before link handling
    $beforeLinks = array(
        "/\\{\\{([^|]+?)\\}\\}/" => "",
        "/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
        "/\\[\\[([^|]+?)\\]\\]/" => "\\1",
    );
    foreach ( $beforeLinks as $pattern => $replacement ) {
        $text = preg_replace( $pattern, $replacement, $text );
    }

    // Piped links need a callback to decide between caption and whole link.
    $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

    // Patterns whose matches are simply deleted, in this order.
    $toStrip = array(
        "/<\/?[^>]+>/",
        "/'''''/",
        "/('''|<\/?[iIuUbB]>)/",
        "/''/",
    );
    foreach ( $toStrip as $pattern ) {
        $text = preg_replace( $pattern, "", $text );
    }

    wfProfileOut( $profileSection );
    return $text;
}
01343 
/**
 * Callback for piped links in removeWiki(): decide what a link becomes.
 *
 * A link whose target prefix (the text before the first colon) resolves to
 * the file or category namespace is kept in full; every other link is
 * reduced to its caption.
 *
 * @param $matches Array: 0 = whole link, 1 = target including the pipe,
 *   2 = caption
 * @return String
 */
function linkReplace( $matches ) {
    global $wgContLang;

    $colonPos = strpos( $matches[1], ':' );
    if ( $colonPos !== false ) {
        $prefix = substr( $matches[1], 0, $colonPos );
        $nsIndex = $wgContLang->getNsIndex( $prefix );
        $keepWhole = $nsIndex !== false
            && ( $nsIndex == NS_FILE || $nsIndex == NS_CATEGORY );
        if ( $keepWhole ) {
            // return the whole thing
            return $matches[0];
        }
    }

    // replace with caption
    return $matches[2];
}
01363 
/**
 * Simple highlighting: returns the lines containing a search term, each
 * trimmed to some context around the match and HTML-escaped, with every
 * occurrence of a term wrapped in a "searchmatch" span.
 *
 * @param $text String
 * @param $terms Array: search terms; they are joined with "|" and placed
 *   into the regular expressions verbatim, so they must already be valid
 *   pattern fragments
 * @param $contextlines Integer: maximum number of matching lines to return
 * @param $contextchars Integer: amount of context kept on each side of the
 *   match
 * @return String: the highlighted lines, each terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
    global $wgContLang;
    $fname = __METHOD__;

    $lines = explode( "\n", $text );

    $terms = implode( '|', $terms );
    $max = intval( $contextchars ) + 1;
    $pat1 = "/(.*)($terms)(.{0,$max})/i";
    // The highlighting pattern depends only on $terms, so build it once.
    $pat2 = '/(' . $terms . ")/i";

    $extract = "";
    wfProfileIn( "$fname-extract" );
    foreach ( $lines as $line ) {
        if ( 0 == $contextlines ) {
            break;
        }
        $m = array();
        if ( !preg_match( $pat1, $line, $m ) ) {
            continue;
        }
        --$contextlines;
        // truncate function changes ... to relevant i18n message.
        $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

        // Guard the access to the third group directly; counting the
        // entries against 3 could never protect the $m[3] read.
        if ( isset( $m[3] ) ) {
            $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
        } else {
            $post = '';
        }

        $found = $m[2];

        // NOTE(review): terms are highlighted on the already-escaped text,
        // so a term such as "amp" can also match inside an HTML entity.
        $line = htmlspecialchars( $pre . $found . $post );
        $line = preg_replace( $pat2,
            "<span class='searchmatch'>\\1</span>", $line );

        $extract .= $line . "\n";
    }
    wfProfileOut( "$fname-extract" );

    return $extract;
}
01420 
01421 }
01422 
/**
 * Search engine that adds nothing of its own: every method and property is
 * inherited unchanged from SearchEngine.
 */
class SearchEngineDummy extends SearchEngine {
    // Intentionally empty.
}