MediaWiki  REL1_19
SearchEngine.php
Go to the documentation of this file.
00001 <?php
/**
 * Base search engine.
 *
 * Holds the query settings shared by search back-ends (limit, offset,
 * namespaces, feature data), provides do-nothing default implementations of
 * the search/update entry points, and hosts static helpers for "near match"
 * title lookup, namespace selection and search URL templates.
 */
class SearchEngine {
	/** Maximum number of results; written by setLimitOffset() */
	public $limit = 10;

	/** Number of results to skip; written by setLimitOffset() */
	public $offset = 0;

	/** Not read or written by the code in this class */
	public $prefix = '';

	/** Not read or written by the code in this class */
	public $searchTerms = array();

	/** Namespace indexes to search, or null once replacePrefixes() saw the "all" keyword */
	public $namespaces = array( NS_MAIN );

	/** Not read or written by the code in this class */
	public $showRedirects = false;

	/** Feature name => data, as stored by setFeatureData() */
	protected $features = array();

	/** Database handle given to (or fetched by) the constructor */
	protected $db;

	/**
	 * @param $db mixed Database handle to use; when empty a DB_SLAVE
	 *   connection is obtained through wfGetDB()
	 */
	public function __construct( $db = null ) {
		$this->db = $db ? $db : wfGetDB( DB_SLAVE );
	}

	/**
	 * Full-text search. This base implementation finds nothing.
	 *
	 * @param $term String: search term
	 * @return null
	 */
	public function searchText( $term ) {
		return null;
	}

	/**
	 * Title search. This base implementation finds nothing.
	 *
	 * @param $term String: search term
	 * @return null
	 */
	public function searchTitle( $term ) {
		return null;
	}

	/**
	 * Flagged with wfDeprecated() as of 1.18; forwards to
	 * supports( 'list-redirects' ).
	 *
	 * @return Boolean
	 */
	public function acceptListRedirects() {
		wfDeprecated( __METHOD__, '1.18' );
		return $this->supports( 'list-redirects' );
	}

	/**
	 * Tell whether this engine supports the named feature.
	 *
	 * @param $feature String: feature name
	 * @return Boolean: true for 'list-redirects', false for everything else
	 */
	public function supports( $feature ) {
		switch( $feature ) {
		case 'list-redirects':
			return true;
		case 'title-suffix-filter':
		default:
			return false;
		}
	}

	/**
	 * Store data for a feature, replacing any previous value.
	 *
	 * @param $feature String: feature name, used as array key
	 * @param $data mixed
	 */
	public function setFeatureData( $feature, $data ) {
		$this->features[$feature] = $data;
	}

	/**
	 * Pass a string through the content language's segmentByWord().
	 *
	 * @param $string String
	 * @return String
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		// Some languages such as Chinese require word segmentation
		return $wgContLang->segmentByWord( $string );
	}

	/**
	 * Extension point for rewriting a search term; the base implementation
	 * returns it untouched.
	 *
	 * @param $term String
	 * @return String
	 */
	public function transformSearchTerm( $term ) {
		return $term;
	}

	/**
	 * Find the title a search term most likely refers to, then let the
	 * 'SearchGetNearMatchComplete' hook adjust the result.
	 *
	 * @param $searchterm String
	 * @return Title|null
	 */
	public static function getNearMatch( $searchterm ) {
		$title = self::getNearMatchInternal( $searchterm );

		wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
		return $title;
	}

	/**
	 * Same lookup as getNearMatch(), wrapped in a result set.
	 *
	 * @param $searchterm String
	 * @return SearchNearMatchResultSet
	 */
	public static function getNearMatchResultSet( $searchterm ) {
		return new SearchNearMatchResultSet( self::getNearMatch( $searchterm ) );
	}

	/**
	 * Worker for getNearMatch().
	 *
	 * For the term and each of its language variants: try the title as
	 * typed, then lower case, word-capitalised, upper case and
	 * capitalised-at-word-breaks forms, then the 'SearchGetNearMatch' hook.
	 * If none of those match, apply the special cases for IP addresses, the
	 * User, File and MediaWiki namespaces, and quoted terms.
	 *
	 * @param $searchterm String
	 * @return Title|null
	 */
	private static function getNearMatchInternal( $searchterm ) {
		global $wgContLang, $wgEnableSearchContributorsByIP;

		$allSearchTerms = array( $searchterm );

		if ( $wgContLang->hasVariants() ) {
			$allSearchTerms = array_merge( $allSearchTerms, $wgContLang->autoConvertToAllVariants( $searchterm ) );
		}

		$titleResult = null;
		if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) {
			return $titleResult;
		}

		// Content-language case conversions to try, in this order, when the
		// term as typed does not lead to an existing page.
		$caseConversions = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

		foreach ( $allSearchTerms as $term ) {

			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}

			if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
				return $title;
			}

			# See if it still otherwise has content in some sane sense
			$page = WikiPage::factory( $title );
			if ( $page->hasViewableContent() ) {
				return $title;
			}

			foreach ( $caseConversions as $conversion ) {
				$title = Title::newFromText( $wgContLang->$conversion( $term ) );
				if ( $title && $title->exists() ) {
					return $title;
				}
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		$title = Title::newFromText( $searchterm );
		if ( is_null( $title ) ) {
			// Everything below dereferences $title; bail out instead of
			// failing with a call on null.
			return null;
		}

		# Entering an IP address goes to the contributions page
		if ( $wgEnableSearchContributorsByIP ) {
			if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
				|| User::isIP( trim( $searchterm ) ) ) {
				return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
			}
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $title->getNamespace() == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $title->getNamespace() == NS_FILE ) {
			$image = wfFindFile( $title );
			if ( $image ) {
				return $title;
			}
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$matches = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
			return self::getNearMatch( $matches[1] );
		}

		return null;
	}

	/**
	 * Characters kept by filter(), written as the inside of a regular
	 * expression character class.
	 *
	 * @return String
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'.0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the paging window. Both values are converted with intval().
	 *
	 * @param $limit Integer
	 * @param $offset Integer
	 */
	public function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set which namespaces to search.
	 *
	 * @param $namespaces Array|null
	 */
	public function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Interpret a leading "prefix:" in the query.
	 *
	 * The localised 'searchall' keyword followed by a colon sets
	 * $this->namespaces to null; a prefix that getNsIndex() recognises
	 * restricts $this->namespaces to that namespace. In both cases the
	 * prefix is removed from the returned query, unless nothing but
	 * whitespace would be left. The 'SearchEngineReplacePrefixesComplete'
	 * hook always runs last and may change the result.
	 *
	 * @param $query String
	 * @return String: the query to actually search for
	 */
	public function replacePrefixes( $query ) {
		global $wgContLang;

		$parsed = $query;
		$colonPos = strpos( $query, ':' );

		if ( $colonPos !== false ) {
			$allkeyword = wfMsgForContent( 'searchall' ) . ":";
			if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
				$this->namespaces = null;
				$parsed = substr( $query, strlen( $allkeyword ) );
			} else {
				$prefix = substr( $query, 0, $colonPos );
				$index = $wgContLang->getNsIndex( $prefix );
				if ( $index !== false ) {
					$this->namespaces = array( $index );
					$parsed = substr( $query, strlen( $prefix ) + 1 );
				}
			}
			if ( trim( $parsed ) == '' ) {
				$parsed = $query; // prefix was the whole query
			}
		}

		wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

		return $parsed;
	}

	/**
	 * List the content language's namespaces with an index of NS_MAIN or
	 * above, after the 'SearchableNamespaces' hook had its say.
	 *
	 * @return Array: namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;
		$arr = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns >= NS_MAIN ) {
				$arr[$ns] = $name;
			}
		}

		wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
		return $arr;
	}

	/**
	 * Namespaces a user wants searched, taken from the user's options.
	 *
	 * The 'searcheverything' option selects every searchable namespace; it
	 * is only honoured for logged-in users when
	 * $wgSearchEverythingOnlyLoggedIn is set. Otherwise each namespace is
	 * included if the matching 'searchNs<index>' option is set.
	 *
	 * @param $user User
	 * @return Array: namespace indexes
	 */
	public static function userNamespaces( $user ) {
		global $wgSearchEverythingOnlyLoggedIn;

		$searchableNamespaces = self::searchableNamespaces();

		// get search everything preference, that can be set to be read for logged-in users
		// it overrides other options
		if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
			if ( $user->getOption( 'searcheverything' ) ) {
				return array_keys( $searchableNamespaces );
			}
		}

		$arr = array();
		foreach ( $searchableNamespaces as $ns => $name ) {
			if ( $user->getOption( 'searchNs' . $ns ) ) {
				$arr[] = $ns;
			}
		}

		return $arr;
	}

	/**
	 * Snippet size used for highlighting. The values are hard-coded.
	 *
	 * @return Array: ( number of context lines, context characters per line )
	 */
	public static function userHighlightPrefs() {
		$contextlines = 2; // Hardcode this. Old defaults sucked. :)
		$contextchars = 75; // same as above.... :P
		return array( $contextlines, $contextchars );
	}

	/**
	 * Keys of $wgNamespacesToBeSearchedDefault whose value compares equal
	 * to true.
	 *
	 * @return Array
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Turn namespace indexes into display names; an empty name is replaced
	 * by the 'blanknamespace' message.
	 *
	 * @param $namespaces Array
	 * @return Array: same keys as the input
	 */
	public static function namespacesAsText( $namespaces ) {
		global $wgContLang;

		$formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
		foreach ( $formatted as $key => $ns ) {
			if ( empty( $ns ) ) {
				$formatted[$key] = wfMsg( 'blanknamespace' );
			}
		}
		return $formatted;
	}

	/**
	 * Keys of $wgNamespacesToBeSearchedHelp whose value compares equal to
	 * true.
	 *
	 * @return Array
	 */
	public static function helpNamespaces() {
		global $wgNamespacesToBeSearchedHelp;

		return array_keys( $wgNamespacesToBeSearchedHelp, true );
	}

	/**
	 * Replace every character outside legalSearchChars() with a space and
	 * trim the result.
	 *
	 * @param $text String
	 * @return String
	 */
	public function filter( $text ) {
		// Called through $this so that a subclass's character set is used
		$lc = $this->legalSearchChars();
		return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
	}

	/**
	 * Instantiate the configured search engine.
	 *
	 * The class comes from $wgSearchType when set; otherwise from the
	 * DB_SLAVE connection's getSearchEngine(). The connection is only
	 * passed to the constructor in the latter case (null otherwise). The
	 * new engine's limit and offset are both set to 0.
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgSearchType;
		$dbr = null;
		if ( $wgSearchType ) {
			$class = $wgSearchType;
		} else {
			$dbr = wfGetDB( DB_SLAVE );
			$class = $dbr->getSearchEngine();
		}
		$search = new $class( $dbr );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * Index update entry point; the base implementation does nothing.
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	public function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Title-only index update entry point; the base implementation does
	 * nothing.
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	public function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * OpenSearch URL template: $wgOpenSearchTemplate when set, otherwise an
	 * api.php URL on $wgCanonicalServer restricted to the default
	 * namespaces.
	 *
	 * @return String
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgCanonicalServer;
		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		}

		$ns = implode( '|', self::defaultNamespaces() );
		if ( !$ns ) {
			// Also taken when the list is just namespace 0: "0" is falsy
			$ns = "0";
		}
		return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
	}

	/**
	 * Suggestion URL template: $wgMWSuggestTemplate when set, otherwise an
	 * api.php URL on $wgServer with a {namespaces} placeholder.
	 *
	 * @return String
	 */
	public static function getMWSuggestTemplate() {
		global $wgMWSuggestTemplate, $wgServer;
		if ( $wgMWSuggestTemplate ) {
			return $wgMWSuggestTemplate;
		}
		return $wgServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
	}
}
00503 
/**
 * Base class for search result sets.
 *
 * Every method here describes an empty set; back-ends override the ones
 * they can answer.
 */
class SearchResultSet {
	/**
	 * Terms that were matched, for highlighting.
	 *
	 * @return Array: empty in the base class
	 */
	public function termMatches() {
		return array();
	}

	/**
	 * @return Integer: number of rows in this set; 0 in the base class
	 */
	public function numRows() {
		return 0;
	}

	/**
	 * @return Boolean: whether the set holds any result; false in the base class
	 */
	public function hasResults() {
		return false;
	}

	/**
	 * @return null: the base class does not know a total hit count
	 */
	public function getTotalHits() {
		return null;
	}

	/**
	 * @return Boolean: whether a query suggestion is available; false in the base class
	 */
	public function hasSuggestion() {
		return false;
	}

	/**
	 * @return null: no suggested query in the base class
	 */
	public function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return String: snippet for the suggested query; empty in the base class
	 */
	public function getSuggestionSnippet() {
		return '';
	}

	/**
	 * @return null: no extra information in the base class
	 */
	public function getInfo() {
		return null;
	}

	/**
	 * @return null: no interwiki results in the base class
	 */
	public function getInterwikiResults() {
		return null;
	}

	/**
	 * Tell whether getInterwikiResults() has anything to offer.
	 *
	 * The comparison is deliberately loose, so null, false and an empty
	 * array all count as "no results".
	 *
	 * @return Boolean
	 */
	public function hasInterwikiResults() {
		$interwiki = $this->getInterwikiResults();
		return $interwiki != null;
	}

	/**
	 * Fetch the next result.
	 *
	 * @return Boolean: false, as the base set is always exhausted
	 */
	public function next() {
		return false;
	}

	/**
	 * Release resources held by the set; nothing to do in the base class.
	 */
	public function free() {
		// ...
	}
}
00616 
/**
 * Result set backed by a database query result.
 *
 * The wrapped result may be the boolean false, in which case the set
 * behaves as if there were nothing to read.
 */
class SqlSearchResultSet extends SearchResultSet {

	/** Query result object, or false */
	protected $mResultSet;

	/**
	 * Terms handed to the constructor and returned by termMatches().
	 *
	 * Declared explicitly instead of being created on first assignment;
	 * kept public because that is what the implicitly created property was.
	 */
	public $mTerms;

	/**
	 * @param $resultSet mixed Query result to iterate over, or false
	 * @param $terms mixed Value to report from termMatches()
	 */
	public function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	/**
	 * @return mixed: the terms given to the constructor
	 */
	public function termMatches() {
		return $this->mTerms;
	}

	/**
	 * @return Integer|Boolean: row count of the wrapped result, or false
	 *   when there is no result to count
	 */
	public function numRows() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		return $this->mResultSet->numRows();
	}

	/**
	 * Fetch the next row and wrap it in a SearchResult.
	 *
	 * @return SearchResult|Boolean: false when there is no result or no
	 *   further row
	 */
	public function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}

		return SearchResult::newFromRow( $row );
	}

	/**
	 * Free the wrapped result.
	 *
	 * @return Boolean|null: false when there was no result to free,
	 *   nothing otherwise
	 */
	public function free() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$this->mResultSet->free();
	}
}
00658 
/**
 * Marker class with no members of its own.
 *
 * Some search engines may bail out if too many matches are found.
 */
class SearchResultTooMany {
}
00665 
00666 
/**
 * A single search hit.
 *
 * Wraps the hit's title together with its revision (looked up through
 * Revision::newFromTitle()) and, for titles in NS_FILE, the file found by
 * wfFindFile(). Text-based answers are derived from the revision text, which
 * is loaded on first use.
 */
class SearchResult {

	/** Result of Revision::newFromTitle() for the title; null until initialised */
	public $mRevision = null;

	/** Result of wfFindFile() for NS_FILE titles; null otherwise */
	public $mImage = null;

	/** Title of the hit, or null */
	public $mTitle;

	/** Revision text; unset until initText() has run */
	public $mText;

	/**
	 * Build a result for a title.
	 *
	 * @param $title Title|null
	 * @return SearchResult
	 */
	public static function newFromTitle( $title ) {
		$result = new self();
		$result->initFromTitle( $title );
		return $result;
	}

	/**
	 * Build a result from a database row.
	 *
	 * @param $row Object with page_namespace and page_title fields
	 * @return SearchResult
	 */
	public static function newFromRow( $row ) {
		$result = new self();
		$result->initFromRow( $row );
		return $result;
	}

	/**
	 * @param $row Object|null: when given, the result is initialised from
	 *   it as newFromRow() would do
	 */
	public function __construct( $row = null ) {
		if ( !is_null( $row ) ) {
			// Backwards compatibility with pre-1.17 callers
			$this->initFromRow( $row );
		}
	}

	/**
	 * Initialise from a row by building its title with Title::makeTitle().
	 *
	 * @param $row Object with page_namespace and page_title fields
	 */
	protected function initFromRow( $row ) {
		$this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
	}

	/**
	 * Store the title and, unless it is null, look up its revision and
	 * (for NS_FILE titles) its file.
	 *
	 * @param $title Title|null
	 */
	protected function initFromTitle( $title ) {
		$this->mTitle = $title;
		if ( is_null( $this->mTitle ) ) {
			return;
		}

		$this->mRevision = Revision::newFromTitle( $this->mTitle );
		if ( $this->mTitle->getNamespace() === NS_FILE ) {
			$this->mImage = wfFindFile( $this->mTitle );
		}
	}

	/**
	 * @return Boolean: true when the result has no title
	 */
	public function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * @return Boolean: true when neither a revision nor a file was found
	 */
	public function isMissingRevision() {
		return !$this->mRevision && !$this->mImage;
	}

	/**
	 * @return Title|null
	 */
	public function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return null: the base class has no relevance score
	 */
	public function getScore() {
		return null;
	}

	/**
	 * Load the text once: the revision's text, or an empty string when
	 * there is no revision.
	 */
	protected function initText() {
		if ( isset( $this->mText ) ) {
			return;
		}

		if ( $this->mRevision != null ) {
			$this->mText = $this->mRevision->getText();
		} else {
			// TODO: can we fetch raw wikitext for commons images?
			$this->mText = '';
		}
	}

	/**
	 * Highlighted extract of the text.
	 *
	 * Uses SearchHighlighter::highlightText() when
	 * $wgAdvancedSearchHighlighting is set and highlightSimple() otherwise.
	 *
	 * @param $terms Array: terms to highlight
	 * @return String
	 */
	public function getTextSnippet( $terms ) {
		global $wgAdvancedSearchHighlighting;

		$this->initText();
		// userHighlightPrefs() takes no arguments; its values are fixed
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
		$highlighter = new SearchHighlighter();

		if ( $wgAdvancedSearchHighlighting ) {
			return $highlighter->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		}
		return $highlighter->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String: highlighted title; empty in the base class
	 */
	public function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String: highlighted redirect name; empty in the base class
	 */
	public function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return null: the base class knows no redirect title
	 */
	public function getRedirectTitle() {
		return null;
	}

	/**
	 * @return String: highlighted section name; empty in the base class
	 */
	public function getSectionSnippet() {
		return '';
	}

	/**
	 * @return null: the base class knows no section title
	 */
	public function getSectionTitle() {
		return null;
	}

	/**
	 * @return String: timestamp of the revision, else of the file, else ''
	 */
	public function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		}
		if ( $this->mImage ) {
			return $this->mImage->getTimestamp();
		}
		return '';
	}

	/**
	 * @return Integer: str_word_count() of the text
	 */
	public function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return Integer: length of the text in bytes
	 */
	public function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return Boolean: whether related results exist; false in the base class
	 */
	public function hasRelated() {
		return false;
	}

	/**
	 * @return String: interwiki prefix of the hit; empty in the base class
	 */
	public function getInterwikiPrefix() {
		return '';
	}
}
/**
 * Result set holding at most one result: the near match handed to the
 * constructor.
 */
class SearchNearMatchResultSet extends SearchResultSet {
	/** Whether next() has already handed out the match */
	private $fetched = false;

	/**
	 * The match given to the constructor.
	 *
	 * Declared explicitly instead of being created on first assignment;
	 * kept public because that is what the implicitly created property was.
	 */
	public $result;

	/**
	 * @param $match mixed The matched title, or an empty value for no match
	 */
	public function __construct( $match ) {
		$this->result = $match;
	}

	/**
	 * Kept for existing callers; same answer as hasResults().
	 *
	 * @return Boolean: whether a match was given
	 */
	public function hasResult() {
		return (bool)$this->result;
	}

	/**
	 * Overrides the inherited method, which always answers false and would
	 * otherwise make this set look empty even when it holds a match.
	 *
	 * @return Boolean: whether a match was given
	 */
	public function hasResults() {
		return $this->hasResult();
	}

	/**
	 * @return Integer: 1 when a match was given, 0 otherwise
	 */
	public function numRows() {
		return $this->hasResult() ? 1 : 0;
	}

	/**
	 * Hand out the match once.
	 *
	 * @return SearchResult|Boolean: the match wrapped in a SearchResult on
	 *   the first call, false afterwards or when there is no match
	 */
	public function next() {
		if ( $this->fetched || !$this->result ) {
			return false;
		}
		$this->fetched = true;
		return SearchResult::newFromTitle( $this->result );
	}
}
00911 
00917 class SearchHighlighter {
00918         var $mCleanWikitext = true;
00919 
/**
 * Store the wikitext clean-up switch in $mCleanWikitext.
 *
 * @param $cleanupWikitext Boolean: value for $mCleanWikitext, true by
 *   default. NOTE(review): presumably controls whether markup is stripped
 *   from snippets; the code using the flag is outside this excerpt.
 */
00920         function __construct( $cleanupWikitext = true ) {
00921                 $this->mCleanWikitext = $cleanupWikitext;
00922         }
00923 
00933         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
00934                 global $wgContLang;
00935                 global $wgSearchHighlightBoundaries;
00936                 $fname = __METHOD__;
00937 
00938                 if ( $text == '' )
00939                         return '';
00940 
00941                 // spli text into text + templates/links/tables
00942                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
00943                 // first capture group is for detecting nested templates/links/tables/references
00944                 $endPatterns = array(
00945                         1 => '/(\{\{)|(\}\})/', // template
00946                         2 => '/(\[\[)|(\]\])/', // image
00947                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table
00948 
00949                 // @todo FIXME: This should prolly be a hook or something
00950                 if ( function_exists( 'wfCite' ) ) {
00951                         $spat .= '|(<ref>)'; // references via cite extension
00952                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
00953                 }
00954                 $spat .= '/';
00955                 $textExt = array(); // text extracts
00956                 $otherExt = array();  // other extracts
00957                 wfProfileIn( "$fname-split" );
00958                 $start = 0;
00959                 $textLen = strlen( $text );
00960                 $count = 0; // sequence number to maintain ordering
00961                 while ( $start < $textLen ) {
00962                         // find start of template/image/table
00963                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
00964                                 $epat = '';
00965                                 foreach ( $matches as $key => $val ) {
00966                                         if ( $key > 0 && $val[1] != - 1 ) {
00967                                                 if ( $key == 2 ) {
00968                                                         // see if this is an image link
00969                                                         $ns = substr( $val[0], 2, - 1 );
00970                                                         if ( $wgContLang->getNsIndex( $ns ) != NS_FILE )
00971                                                                 break;
00972 
00973                                                 }
00974                                                 $epat = $endPatterns[$key];
00975                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
00976                                                 $start = $val[1];
00977                                                 break;
00978                                         }
00979                                 }
00980                                 if ( $epat ) {
00981                                         // find end (and detect any nested elements)
00982                                         $level = 0;
00983                                         $offset = $start + 1;
00984                                         $found = false;
00985                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
00986                                                 if ( array_key_exists( 2, $endMatches ) ) {
00987                                                         // found end
00988                                                         if ( $level == 0 ) {
00989                                                                 $len = strlen( $endMatches[2][0] );
00990                                                                 $off = $endMatches[2][1];
00991                                                                 $this->splitAndAdd( $otherExt, $count,
00992                                                                         substr( $text, $start, $off + $len  - $start ) );
00993                                                                 $start = $off + $len;
00994                                                                 $found = true;
00995                                                                 break;
00996                                                         } else {
00997                                                                 // end of nested element
00998                                                                 $level -= 1;
00999                                                         }
01000                                                 } else {
01001                                                         // nested
01002                                                         $level += 1;
01003                                                 }
01004                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
01005                                         }
01006                                         if ( ! $found ) {
01007                                                 // couldn't find appropriate closing tag, skip
01008                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
01009                                                 $start += strlen( $matches[0][0] );
01010                                         }
01011                                         continue;
01012                                 }
01013                         }
01014                         // else: add as text extract
01015                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
01016                         break;
01017                 }
01018 
01019                 $all = $textExt + $otherExt; // these have disjunct key sets
01020 
01021                 wfProfileOut( "$fname-split" );
01022 
01023                 // prepare regexps
01024                 foreach ( $terms as $index => $term ) {
01025                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
01026                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
01027                                 $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
01028                         } else {
01029                                 $terms[$index] = $term;
01030                         }
01031                 }
01032                 $anyterm = implode( '|', $terms );
01033                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
01034 
01035                 // @todo FIXME: A hack to scale contextchars, a correct solution
01036                 // would be to have contextchars actually be char and not byte
01037                 // length, and do proper utf-8 substrings and lengths everywhere,
01038                 // but PHP is making that very hard and unclean to implement :(
01039                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
01040                 $contextchars = intval( $contextchars * $scale );
01041 
01042                 $patPre = "(^|$wgSearchHighlightBoundaries)";
01043                 $patPost = "($wgSearchHighlightBoundaries|$)";
01044 
01045                 $pat1 = "/(" . $phrase . ")/ui";
01046                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
01047 
01048                 wfProfileIn( "$fname-extract" );
01049 
01050                 $left = $contextlines;
01051 
01052                 $snippets = array();
01053                 $offsets = array();
01054 
01055                 // show beginning only if it contains all words
01056                 $first = 0;
01057                 $firstText = '';
01058                 foreach ( $textExt as $index => $line ) {
01059                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
01060                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
01061                                 $first = $index;
01062                                 break;
01063                         }
01064                 }
01065                 if ( $firstText ) {
01066                         $succ = true;
01067                         // check if first text contains all terms
01068                         foreach ( $terms as $term ) {
01069                                 if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
01070                                         $succ = false;
01071                                         break;
01072                                 }
01073                         }
01074                         if ( $succ ) {
01075                                 $snippets[$first] = $firstText;
01076                                 $offsets[$first] = 0;
01077                         }
01078                 }
01079                 if ( ! $snippets ) {
01080                         // match whole query on text
01081                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
01082                         // match whole query on templates/tables/images
01083                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
01084                         // match any words on text
01085                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
01086                         // match any words on templates/tables/images
01087                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
01088 
01089                         ksort( $snippets );
01090                 }
01091 
01092                 // add extra chars to each snippet to make snippets constant size
01093                 $extended = array();
01094                 if ( count( $snippets ) == 0 ) {
01095                         // couldn't find the target words, just show beginning of article
01096                         if ( array_key_exists( $first, $all ) ) {
01097                                 $targetchars = $contextchars * $contextlines;
01098                                 $snippets[$first] = '';
01099                                 $offsets[$first] = 0;
01100                         }
01101                 } else {
01102                         // if begin of the article contains the whole phrase, show only that !!
01103                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
01104                                 && $offsets[$first] < $contextchars * 2 ) {
01105                                 $snippets = array ( $first => $snippets[$first] );
01106                         }
01107 
01108                         // calc by how much to extend existing snippets
01109                         $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
01110                 }
01111 
01112                 foreach ( $snippets as $index => $line ) {
01113                         $extended[$index] = $line;
01114                         $len = strlen( $line );
01115                         if ( $len < $targetchars - 20 ) {
01116                                 // complete this line
01117                                 if ( $len < strlen( $all[$index] ) ) {
01118                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
01119                                         $len = strlen( $extended[$index] );
01120                                 }
01121 
01122                                 // add more lines
01123                                 $add = $index + 1;
01124                                 while ( $len < $targetchars - 20
01125                                            && array_key_exists( $add, $all )
01126                                            && !array_key_exists( $add, $snippets ) ) {
01127                                         $offsets[$add] = 0;
01128                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
01129                                         $extended[$add] = $tt;
01130                                         $len += strlen( $tt );
01131                                         $add++;
01132                                 }
01133                         }
01134                 }
01135 
01136                 // $snippets = array_map('htmlspecialchars', $extended);
01137                 $snippets = $extended;
01138                 $last = - 1;
01139                 $extract = '';
01140                 foreach ( $snippets as $index => $line ) {
01141                         if ( $last == - 1 )
01142                                 $extract .= $line; // first line
01143                         elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) )
01144                                 $extract .= " " . $line; // continous lines
01145                         else
01146                                 $extract .= '<b> ... </b>' . $line;
01147 
01148                         $last = $index;
01149                 }
01150                 if ( $extract )
01151                         $extract .= '<b> ... </b>';
01152 
01153                 $processed = array();
01154                 foreach ( $terms as $term ) {
01155                         if ( ! isset( $processed[$term] ) ) {
01156                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
01157                                 $extract = preg_replace( $pat3,
01158                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
01159                                 $processed[$term] = true;
01160                         }
01161                 }
01162 
01163                 wfProfileOut( "$fname-extract" );
01164 
01165                 return $extract;
01166         }
01167 
/**
 * Split text into lines and append every non-empty, trimmed line to the
 * extracts array.
 *
 * When $this->mCleanWikitext is truthy the text is first passed through
 * removeWiki(); otherwise it is split as-is.
 *
 * @param $extracts Array: (by reference) receives the lines, each stored under the current value of $count
 * @param $count Integer: (by reference) next key to use in $extracts; incremented once per stored line
 * @param $text String: text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
    $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
    foreach ( $split as $line ) {
        $tt = trim( $line );
        // Compare against '' explicitly: a bare truthiness test would also
        // discard a line whose entire content is "0".
        if ( $tt !== '' ) {
            $extracts[$count++] = $tt;
        }
    }
}
01183 
/**
 * Regex-replace callback operating on the whole match ($matches[0]).
 *
 * A match longer than one byte is turned into a bracketed character class
 * holding the results of $wgContLang->lc() and $wgContLang->uc() for that
 * match, in that order. A match of at most one byte is returned unchanged.
 *
 * @param $matches Array: match array as passed by preg_replace_callback()
 * @return String
 */
function caseCallback( $matches ) {
    global $wgContLang;

    $matched = $matches[0];

    // Nothing to expand for single-byte (or empty) matches.
    if ( strlen( $matched ) <= 1 ) {
        return $matched;
    }

    $lower = $wgContLang->lc( $matched );
    $upper = $wgContLang->uc( $matched );

    return '[' . $lower . $upper . ']';
}
01197 
/**
 * Cut the part of $text lying between two byte offsets, after moving both
 * offsets to break points chosen by position().
 *
 * A non-zero $start is adjusted with position( $text, $start, 1 ). An $end
 * at or beyond the end of the text is clamped to the text length; any other
 * $end is adjusted with position( $text, $end ).
 *
 * @param $text String
 * @param $start Integer: requested start offset (bytes)
 * @param $end Integer: requested end offset (bytes)
 * @param $posStart Integer: (by reference) receives the adjusted start, but
 *   only when the variable passed in is not null
 * @param $posEnd Integer: (by reference) receives the adjusted end, but only
 *   when the variable passed in is not null
 * @return String: the selected substring, or '' when the adjusted end does
 *   not lie after the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
    $textLength = strlen( $text );

    if ( $start != 0 ) {
        $start = $this->position( $text, $start, 1 );
    }

    $end = ( $end >= $textLength )
        ? $textLength
        : $this->position( $text, $end );

    // Report the adjusted offsets only to callers that supplied a value.
    if ( $posStart !== null ) {
        $posStart = $start;
    }
    if ( $posEnd !== null ) {
        $posEnd = $end;
    }

    return ( $end > $start )
        ? substr( $text, $start, $end - $start )
        : '';
}
01231 
/**
 * Find a place near $point where the text can be cut.
 *
 * The window of up to 10 bytes on either side of $point is searched for the
 * first space or punctuation byte; if one is found, its offset plus $offset
 * is returned. Otherwise $point itself is used, moved forward past any UTF-8
 * continuation bytes (0x80-0xBF) so that a multi-byte character is never
 * split. $offset is NOT applied in that fallback case.
 *
 * The fallback result is always clamped to the range 0..strlen( $text ), so
 * an out-of-range $point (or an empty $text) no longer reads an undefined
 * string offset.
 *
 * @param $text String
 * @param $point Integer: byte offset to look around
 * @param $offset Integer: added to the result when a delimiter was found
 * @return Integer: byte offset into $text
 */
function position( $text, $point, $offset = 0 ) {
    $tolerance = 10;
    $textLen = strlen( $text );

    $s = max( 0, $point - $tolerance );
    $l = min( $textLen, $point + $tolerance ) - $s;
    $m = array();
    // $l <= 0 means the window lies outside the text; there is nothing to
    // search, and a negative length must not be handed to substr().
    if ( $l > 0
        && preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE )
    ) {
        return $m[0][1] + $s + $offset;
    }

    // No delimiter nearby: settle on $point, but make sure it sits on the
    // first byte of a UTF-8 character and inside the string.
    if ( $point < 0 ) {
        $point = 0;
    }
    while ( $point < $textLen ) {
        $char = ord( $text[$point] );
        if ( $char < 0x80 || $char >= 0xc0 ) {
            // ASCII byte or lead byte of a multi-byte sequence
            return $point;
        }
        // skip trailing bytes
        $point++;
    }
    return $textLen;
}
01261 
/**
 * Run a regex over a set of lines and store a basic snippet for every line
 * that matches, until the line budget is used up.
 *
 * Lines whose index is already present in $out are left alone. For each
 * other matching line, a window of $contextchars bytes is positioned around
 * the match and cut out with extract(); the snippet goes to $out[$index] and
 * the start offset reported by extract() goes to $offsets[$index].
 *
 * @param $pattern String: regular expression
 * @param $extracts Array: index => line
 * @param $linesleft Integer: (by reference) number of snippets still wanted;
 *   decremented for each snippet stored
 * @param $contextchars Integer: (by reference, read only here) window size in bytes
 * @param $out Array: (by reference) index => snippet
 * @param $offsets Array: (by reference) index => start offset of the snippet in its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
    foreach ( $extracts as $index => $line ) {
        // Budget exhausted (either on entry or by the previous iteration).
        if ( $linesleft == 0 ) {
            return;
        }

        // this line already highlighted
        if ( array_key_exists( $index, $out ) ) {
            continue;
        }

        $match = array();
        if ( !preg_match( $pattern, $line, $match, PREG_OFFSET_CAPTURE ) ) {
            continue;
        }

        $matchPos = $match[0][1];
        $matchLen = strlen( $match[0][0] );

        if ( $matchPos + $matchLen < $contextchars ) {
            // match ends inside the first window: start at the line start
            $begin = 0;
        } elseif ( $matchLen > $contextchars ) {
            // match is longer than the window: start at the match
            $begin = $matchPos;
        } else {
            // centre the window on the match
            $begin = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
        }

        // basic snippet from this line; extract() reports the adjusted start
        $snippetStart = $begin;
        $out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $snippetStart );
        $offsets[$index] = $snippetStart;

        $linesleft--;
    }
}
01304 
/**
 * Strip a basic subset of wiki markup from text using a fixed sequence of
 * regex replacements. The order of the steps matters: each one works on the
 * output of the previous one.
 *
 * @param $text String
 * @return String
 */
function removeWiki( $text ) {
    wfProfileIn( __METHOD__ );

    // Steps applied before piped links are handled: pattern => replacement.
    $beforeLinks = array(
        // {{...}} without a pipe: removed entirely
        "/\\{\\{([^|]+?)\\}\\}/" => "",
        // {{name|rest}}: keep what follows the first pipe
        "/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
        // [[target]] without a pipe: keep the target
        "/\\[\\[([^|]+?)\\]\\]/" => "\\1",
    );
    foreach ( $beforeLinks as $pattern => $replacement ) {
        $text = preg_replace( $pattern, $replacement, $text );
    }

    // [[target|caption]]: decided per link by linkReplace()
    $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

    // Remaining steps: tags and quote-based emphasis markers.
    $afterLinks = array(
        "/<\/?[^>]+>/" => "",
        "/'''''/" => "",
        "/('''|<\/?[iIuUbB]>)/" => "",
        "/''/" => "",
    );
    foreach ( $afterLinks as $pattern => $replacement ) {
        $text = preg_replace( $pattern, $replacement, $text );
    }

    wfProfileOut( __METHOD__ );
    return $text;
}
01332 
/**
 * Callback for piped links ([[target|caption]]) matched in removeWiki().
 *
 * $matches[1] is the target part including the trailing pipe, $matches[2]
 * the caption. If the text before the first colon in the target resolves,
 * via $wgContLang->getNsIndex(), to NS_FILE or NS_CATEGORY, the whole link
 * is kept; in every other case only the caption is returned.
 *
 * @param $matches Array: match array from preg_replace_callback()
 * @return String
 */
function linkReplace( $matches ) {
    global $wgContLang;

    $colonPos = strpos( $matches[1], ':' );

    if ( $colonPos !== false ) {
        $prefix = substr( $matches[1], 0, $colonPos );
        $nsIndex = $wgContLang->getNsIndex( $prefix );

        if ( $nsIndex !== false && ( $nsIndex == NS_FILE || $nsIndex == NS_CATEGORY ) ) {
            // return the whole thing
            return $matches[0];
        }
    }

    // replace with caption
    return $matches[2];
}
01352 
/**
 * Simple, line-based snippet builder: returns the first $contextlines lines
 * of $text that contain any of the terms, trimmed around the match, HTML
 * escaped, and with every term occurrence wrapped in a 'searchmatch' span.
 *
 * The terms are joined with '|' and interpolated into the regular
 * expressions without escaping, so each one is treated as a regex fragment.
 *
 * @param $text String
 * @param $terms Array: terms to highlight (regex fragments)
 * @param $contextlines Integer: maximum number of matching lines to return
 * @param $contextchars Integer: amount of context kept on each side of the match
 * @return String: the resulting lines, each terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
    global $wgContLang;
    $fname = __METHOD__;

    $lines = explode( "\n", $text );

    $terms = implode( '|', $terms );
    $max = intval( $contextchars ) + 1;
    // Group 1: everything before the match, group 2: the match itself,
    // group 3: at most $max characters after it.
    $pat1 = "/(.*)($terms)(.{0,$max})/i";
    // Highlight pattern; identical for every line, so built once.
    $pat2 = '/(' . $terms . ")/i";

    $extract = "";
    wfProfileIn( "$fname-extract" );
    foreach ( $lines as $line ) {
        if ( 0 == $contextlines ) {
            break;
        }
        $m = array();
        if ( ! preg_match( $pat1, $line, $m ) ) {
            continue;
        }
        --$contextlines;
        // truncate function changes ... to relevant i18n message.
        // NOTE(review): the negative length presumably keeps the end of the
        // string (the part next to the match) - confirm against truncate().
        $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

        // Guard the index that is actually read.
        if ( isset( $m[3] ) ) {
            $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
        } else {
            $post = '';
        }

        $found = $m[2];

        $line = htmlspecialchars( $pre . $found . $post );
        // NOTE(review): the terms are matched against the already-escaped
        // text, so a term can also match inside an entity such as "&amp;".
        $line = preg_replace( $pat2,
            "<span class='searchmatch'>\\1</span>", $line );

        $extract .= "{$line}\n";
    }
    wfProfileOut( "$fname-extract" );

    return $extract;
}
01409 
01410 }
01411 
/**
 * Placeholder search engine: a subclass of SearchEngine that adds and
 * overrides nothing, so every call falls through to the parent class.
 */
class SearchEngineDummy extends SearchEngine {
}