MediaWiki  REL1_21
SearchEngine.php
Go to the documentation of this file.
00001 <?php
00032 class SearchEngine {
00033         var $limit = 10;
00034         var $offset = 0;
00035         var $prefix = '';
00036         var $searchTerms = array();
00037         var $namespaces = array( NS_MAIN );
00038         var $showRedirects = false;
00039 
00041         protected $features = array();
00042 
00046         protected $db;
00047 
00048         function __construct( $db = null ) {
00049                 if ( $db ) {
00050                         $this->db = $db;
00051                 } else {
00052                         $this->db = wfGetDB( DB_SLAVE );
00053                 }
00054         }
00055 
00064         function searchText( $term ) {
00065                 return null;
00066         }
00067 
00076         function searchTitle( $term ) {
00077                 return null;
00078         }
00079 
00085         function acceptListRedirects() {
00086                 wfDeprecated( __METHOD__, '1.18' );
00087                 return $this->supports( 'list-redirects' );
00088         }
00089 
00095         public function supports( $feature ) {
00096                 switch( $feature ) {
00097                 case 'list-redirects':
00098                         return true;
00099                 case 'title-suffix-filter':
00100                 default:
00101                         return false;
00102                 }
00103         }
00104 
00112         public function setFeatureData( $feature, $data ) {
00113                 $this->features[$feature] = $data;
00114         }
00115 
00124         public function normalizeText( $string ) {
00125                 global $wgContLang;
00126 
00127                 // Some languages such as Chinese require word segmentation
00128                 return $wgContLang->segmentByWord( $string );
00129         }
00130 
00135         function transformSearchTerm( $term ) {
00136                 return $term;
00137         }
00138 
00146         public static function getNearMatch( $searchterm ) {
00147                 $title = self::getNearMatchInternal( $searchterm );
00148 
00149                 wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
00150                 return $title;
00151         }
00152 
00160         public static function getNearMatchResultSet( $searchterm ) {
00161                 return new SearchNearMatchResultSet( self::getNearMatch( $searchterm ) );
00162         }
00163 
00168         private static function getNearMatchInternal( $searchterm ) {
00169                 global $wgContLang, $wgEnableSearchContributorsByIP;
00170 
00171                 $allSearchTerms = array( $searchterm );
00172 
00173                 if ( $wgContLang->hasVariants() ) {
00174                         $allSearchTerms = array_merge( $allSearchTerms, $wgContLang->autoConvertToAllVariants( $searchterm ) );
00175                 }
00176 
00177                 $titleResult = null;
00178                 if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) {
00179                         return $titleResult;
00180                 }
00181 
00182                 foreach ( $allSearchTerms as $term ) {
00183 
00184                         # Exact match? No need to look further.
00185                         $title = Title::newFromText( $term );
00186                         if ( is_null( $title ) ) {
00187                                 return null;
00188                         }
00189 
00190                         # Try files if searching in the Media: namespace
00191                         if ( $title->getNamespace() == NS_MEDIA ) {
00192                                 $title = Title::makeTitle( NS_FILE, $title->getText() );
00193                         }
00194 
00195                         if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
00196                                 return $title;
00197                         }
00198 
00199                         # See if it still otherwise has content is some sane sense
00200                         $page = WikiPage::factory( $title );
00201                         if ( $page->hasViewableContent() ) {
00202                                 return $title;
00203                         }
00204 
00205                         if ( !wfRunHooks( 'SearchAfterNoDirectMatch', array( $term, &$title ) ) ) {
00206                                 return $title;
00207                         }
00208 
00209                         # Now try all lower case (i.e. first letter capitalized)
00210                         $title = Title::newFromText( $wgContLang->lc( $term ) );
00211                         if ( $title && $title->exists() ) {
00212                                 return $title;
00213                         }
00214 
00215                         # Now try capitalized string
00216                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
00217                         if ( $title && $title->exists() ) {
00218                                 return $title;
00219                         }
00220 
00221                         # Now try all upper case
00222                         $title = Title::newFromText( $wgContLang->uc( $term ) );
00223                         if ( $title && $title->exists() ) {
00224                                 return $title;
00225                         }
00226 
00227                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
00228                         $title = Title::newFromText( $wgContLang->ucwordbreaks( $term ) );
00229                         if ( $title && $title->exists() ) {
00230                                 return $title;
00231                         }
00232 
00233                         // Give hooks a chance at better match variants
00234                         $title = null;
00235                         if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
00236                                 return $title;
00237                         }
00238                 }
00239 
00240                 $title = Title::newFromText( $searchterm );
00241 
00242                 # Entering an IP address goes to the contributions page
00243                 if ( $wgEnableSearchContributorsByIP ) {
00244                         if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
00245                                 || User::isIP( trim( $searchterm ) ) ) {
00246                                 return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
00247                         }
00248                 }
00249 
00250                 # Entering a user goes to the user page whether it's there or not
00251                 if ( $title->getNamespace() == NS_USER ) {
00252                         return $title;
00253                 }
00254 
00255                 # Go to images that exist even if there's no local page.
00256                 # There may have been a funny upload, or it may be on a shared
00257                 # file repository such as Wikimedia Commons.
00258                 if ( $title->getNamespace() == NS_FILE ) {
00259                         $image = wfFindFile( $title );
00260                         if ( $image ) {
00261                                 return $title;
00262                         }
00263                 }
00264 
00265                 # MediaWiki namespace? Page may be "implied" if not customized.
00266                 # Just return it, with caps forced as the message system likes it.
00267                 if ( $title->getNamespace() == NS_MEDIAWIKI ) {
00268                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
00269                 }
00270 
00271                 # Quoted term? Try without the quotes...
00272                 $matches = array();
00273                 if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
00274                         return SearchEngine::getNearMatch( $matches[1] );
00275                 }
00276 
00277                 return null;
00278         }
00279 
00280         public static function legalSearchChars() {
00281                 return "A-Za-z_'.0-9\\x80-\\xFF\\-";
00282         }
00283 
00291         function setLimitOffset( $limit, $offset = 0 ) {
00292                 $this->limit = intval( $limit );
00293                 $this->offset = intval( $offset );
00294         }
00295 
00302         function setNamespaces( $namespaces ) {
00303                 $this->namespaces = $namespaces;
00304         }
00305 
00313         function replacePrefixes( $query ) {
00314                 global $wgContLang;
00315 
00316                 $parsed = $query;
00317                 if ( strpos( $query, ':' ) === false ) { // nothing to do
00318                         wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
00319                         return $parsed;
00320                 }
00321 
00322                 $allkeyword = wfMessage( 'searchall' )->inContentLanguage()->text() . ":";
00323                 if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
00324                         $this->namespaces = null;
00325                         $parsed = substr( $query, strlen( $allkeyword ) );
00326                 } elseif ( strpos( $query, ':' ) !== false ) {
00327                         $prefix = substr( $query, 0, strpos( $query, ':' ) );
00328                         $index = $wgContLang->getNsIndex( $prefix );
00329                         if ( $index !== false ) {
00330                                 $this->namespaces = array( $index );
00331                                 $parsed = substr( $query, strlen( $prefix ) + 1 );
00332                         }
00333                 }
00334                 if ( trim( $parsed ) == '' )
00335                         $parsed = $query; // prefix was the whole query
00336 
00337                 wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
00338 
00339                 return $parsed;
00340         }
00341 
00346         public static function searchableNamespaces() {
00347                 global $wgContLang;
00348                 $arr = array();
00349                 foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
00350                         if ( $ns >= NS_MAIN ) {
00351                                 $arr[$ns] = $name;
00352                         }
00353                 }
00354 
00355                 wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
00356                 return $arr;
00357         }
00358 
00366         public static function userNamespaces( $user ) {
00367                 global $wgSearchEverythingOnlyLoggedIn;
00368 
00369                 $searchableNamespaces = SearchEngine::searchableNamespaces();
00370 
00371                 // get search everything preference, that can be set to be read for logged-in users
00372                 // it overrides other options
00373                 if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
00374                         if ( $user->getOption( 'searcheverything' ) ) {
00375                                 return array_keys( $searchableNamespaces );
00376                         }
00377                 }
00378 
00379                 $arr = array();
00380                 foreach ( $searchableNamespaces as $ns => $name ) {
00381                         if ( $user->getOption( 'searchNs' . $ns ) ) {
00382                                 $arr[] = $ns;
00383                         }
00384                 }
00385 
00386                 return $arr;
00387         }
00388 
00394         public static function userHighlightPrefs() {
00395                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
00396                 $contextchars = 75; // same as above.... :P
00397                 return array( $contextlines, $contextchars );
00398         }
00399 
00405         public static function defaultNamespaces() {
00406                 global $wgNamespacesToBeSearchedDefault;
00407 
00408                 return array_keys( $wgNamespacesToBeSearchedDefault, true );
00409         }
00410 
00418         public static function namespacesAsText( $namespaces ) {
00419                 global $wgContLang;
00420 
00421                 $formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
00422                 foreach ( $formatted as $key => $ns ) {
00423                         if ( empty( $ns ) )
00424                                 $formatted[$key] = wfMessage( 'blanknamespace' )->text();
00425                 }
00426                 return $formatted;
00427         }
00428 
00434         public static function helpNamespaces() {
00435                 global $wgNamespacesToBeSearchedHelp;
00436 
00437                 return array_keys( $wgNamespacesToBeSearchedHelp, true );
00438         }
00439 
00446         function filter( $text ) {
00447                 $lc = $this->legalSearchChars();
00448                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
00449         }
00456         public static function create() {
00457                 global $wgSearchType;
00458                 $dbr = null;
00459                 if ( $wgSearchType ) {
00460                         $class = $wgSearchType;
00461                 } else {
00462                         $dbr = wfGetDB( DB_SLAVE );
00463                         $class = $dbr->getSearchEngine();
00464                 }
00465                 $search = new $class( $dbr );
00466                 $search->setLimitOffset( 0, 0 );
00467                 return $search;
00468         }
00469 
00479         function update( $id, $title, $text ) {
00480                 // no-op
00481         }
00482 
00491         function updateTitle( $id, $title ) {
00492                 // no-op
00493         }
00494 
00500         public static function getOpenSearchTemplate() {
00501                 global $wgOpenSearchTemplate, $wgCanonicalServer;
00502                 if ( $wgOpenSearchTemplate ) {
00503                         return $wgOpenSearchTemplate;
00504                 } else {
00505                         $ns = implode( '|', SearchEngine::defaultNamespaces() );
00506                         if ( !$ns ) {
00507                                 $ns = "0";
00508                         }
00509                         return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
00510                 }
00511         }
00512 }
00513 
/**
 * Base class for a set of search results.
 *
 * Every method here returns a fixed "empty" answer; subclasses in this file
 * override the ones they can actually answer.
 */
class SearchResultSet {
        /**
         * Terms that were matched, for highlighting.
         *
         * @return array Always empty in the base class
         */
        public function termMatches() {
                return array();
        }

        /**
         * Number of rows in this set.
         *
         * @return int Always 0 in the base class
         */
        public function numRows() {
                return 0;
        }

        /**
         * Whether this set has any results.
         *
         * @return bool Always false in the base class
         */
        public function hasResults() {
                return false;
        }

        /**
         * Total number of hits.
         *
         * @return null Always null in the base class
         */
        public function getTotalHits() {
                return null;
        }

        /**
         * Whether an alternative query is being suggested.
         *
         * @return bool Always false in the base class
         */
        public function hasSuggestion() {
                return false;
        }

        /**
         * Suggested alternative query.
         *
         * @return null Always null in the base class
         */
        public function getSuggestionQuery() {
                return null;
        }

        /**
         * Snippet for the suggested alternative query.
         *
         * @return string Always '' in the base class
         */
        public function getSuggestionSnippet() {
                return '';
        }

        /**
         * Extra information about this result set.
         *
         * @return null Always null in the base class
         */
        public function getInfo() {
                return null;
        }

        /**
         * Results from other wikis.
         *
         * @return null Always null in the base class
         */
        public function getInterwikiResults() {
                return null;
        }

        /**
         * Whether getInterwikiResults() yields anything.
         *
         * Uses a loose comparison against null, so empty values also count as
         * "no results".
         *
         * @return bool
         */
        public function hasInterwikiResults() {
                $interwiki = $this->getInterwikiResults();
                return $interwiki != null;
        }

        /**
         * Fetch the next result.
         *
         * @return bool Always false in the base class
         */
        public function next() {
                return false;
        }

        /**
         * Release any resources held by the set. Nothing to do in the base class.
         */
        public function free() {
                // ...
        }
}
00626 
/**
 * Result set backed by a database result object.
 *
 * The wrapped result may be the literal false; every method checks for that
 * before touching it.
 */
class SqlSearchResultSet extends SearchResultSet {

        /** @var mixed Database result object, or false */
        protected $mResultSet;

        /**
         * Matched terms, as handed to the constructor.
         *
         * Declared explicitly instead of being created on first assignment.
         * Kept public because the dynamically created property was publicly
         * accessible and callers may rely on that.
         *
         * @var mixed
         */
        public $mTerms;

        /**
         * @param mixed $resultSet Database result object, or false
         * @param mixed $terms Matched terms, returned as-is by termMatches()
         */
        function __construct( $resultSet, $terms ) {
                $this->mResultSet = $resultSet;
                $this->mTerms = $terms;
        }

        /**
         * @return mixed The terms passed to the constructor
         */
        function termMatches() {
                return $this->mTerms;
        }

        /**
         * @return int|bool Row count of the wrapped result, or false when the
         *   wrapped result is false
         */
        function numRows() {
                if ( $this->mResultSet === false ) {
                        return false;
                }

                return $this->mResultSet->numRows();
        }

        /**
         * Fetch the next row and wrap it in a SearchResult.
         *
         * @return SearchResult|bool False when there is no result set or no
         *   further row
         */
        function next() {
                if ( $this->mResultSet === false ) {
                        return false;
                }

                $row = $this->mResultSet->fetchObject();
                if ( $row === false ) {
                        return false;
                }

                return SearchResult::newFromRow( $row );
        }

        /**
         * Free the wrapped result.
         *
         * @return bool|null False when there is no result set, nothing otherwise
         */
        function free() {
                if ( $this->mResultSet === false ) {
                        return false;
                }

                $this->mResultSet->free();
        }
}
00668 
/**
 * Empty marker class: it declares no properties or methods.
 * Presumably returned or checked by engines that give up on overly broad
 * queries (no use is visible in this file; confirm against callers).
 */
class SearchResultTooMany {
        # # Some search engines may bail out if too many matches are found
}
00675 
/**
 * A single search hit: a title plus, when available, its revision and file.
 *
 * Most accessors return fixed "nothing" values here; only the title,
 * timestamp and text-derived methods do real work.
 */
class SearchResult {

        /** @var mixed Revision loaded for the title, or null */
        public $mRevision = null;

        /** @var mixed File found for the title (NS_FILE only), or null */
        public $mImage = null;

        /** @var mixed Title of the hit; null marks a broken title */
        public $mTitle;

        /** @var string|null Text used for snippets; filled lazily by initText() */
        public $mText;

        /**
         * Build a result from a title.
         *
         * @param mixed $title Title object, or null
         * @return SearchResult
         */
        public static function newFromTitle( $title ) {
                $result = new self();
                $result->initFromTitle( $title );
                return $result;
        }

        /**
         * Build a result from a database row.
         *
         * @param object $row Row with page_namespace and page_title fields
         * @return SearchResult
         */
        public static function newFromRow( $row ) {
                $result = new self();
                $result->initFromRow( $row );
                return $result;
        }

        /**
         * @param object|null $row Optional database row; see initFromRow()
         */
        public function __construct( $row = null ) {
                if ( !is_null( $row ) ) {
                        // Backwards compatibility with pre-1.17 callers
                        $this->initFromRow( $row );
                }
        }

        /**
         * Initialize from a database row by building a title out of its
         * page_namespace and page_title fields.
         *
         * @param object $row
         */
        protected function initFromRow( $row ) {
                $this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
        }

        /**
         * Initialize from a title.
         *
         * For a non-null title, the 'SearchResultInitFromTitle' hook may pick a
         * revision id, the revision is loaded, and for NS_FILE titles the file
         * is looked up as well.
         *
         * @param mixed $title Title object, or null
         */
        protected function initFromTitle( $title ) {
                $this->mTitle = $title;
                if ( is_null( $this->mTitle ) ) {
                        return;
                }

                // Hook handlers may set $id to select a specific revision.
                $id = false;
                wfRunHooks( 'SearchResultInitFromTitle', array( $title, &$id ) );
                $this->mRevision = Revision::newFromTitle(
                        $this->mTitle, $id, Revision::READ_NORMAL );

                if ( $this->mTitle->getNamespace() === NS_FILE ) {
                        $this->mImage = wfFindFile( $this->mTitle );
                }
        }

        /**
         * @return bool True when no title is set
         */
        function isBrokenTitle() {
                return is_null( $this->mTitle );
        }

        /**
         * @return bool True when neither a revision nor a file is available
         */
        function isMissingRevision() {
                return !$this->mRevision && !$this->mImage;
        }

        /**
         * @return mixed The title this result was initialized with
         */
        function getTitle() {
                return $this->mTitle;
        }

        /**
         * @return null Always null in this class
         */
        function getScore() {
                return null;
        }

        /**
         * Lazily load the text used for snippets and size figures.
         *
         * Uses the revision content's search-index text, or '' when there is no
         * revision or no content.
         */
        protected function initText() {
                if ( isset( $this->mText ) ) {
                        return;
                }

                if ( $this->mRevision != null ) {
                        //TODO: if we could plug in some code that knows about special content models *and* about
                        //      special features of the search engine, the search could benefit.
                        $content = $this->mRevision->getContent();
                        $this->mText = $content ? $content->getTextForSearchIndex() : '';
                } else { // TODO: can we fetch raw wikitext for commons images?
                        $this->mText = '';
                }
        }

        /**
         * Build a highlighted snippet of the text.
         *
         * Uses SearchHighlighter::highlightText() when
         * $wgAdvancedSearchHighlighting is set, highlightSimple() otherwise.
         *
         * @param array $terms Terms to highlight
         * @return string
         */
        function getTextSnippet( $terms ) {
                global $wgAdvancedSearchHighlighting;
                $this->initText();

                // TODO: make highliter take a content object. Make ContentHandler a factory for SearchHighliter.
                // userHighlightPrefs() takes no parameters, so none are passed.
                list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
                $h = new SearchHighlighter();
                if ( $wgAdvancedSearchHighlighting ) {
                        return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
                }

                return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
        }

        /**
         * @param array $terms Terms to highlight
         * @return string Always '' in this class
         */
        function getTitleSnippet( $terms ) {
                return '';
        }

        /**
         * @param array $terms Terms to highlight
         * @return string Always '' in this class
         */
        function getRedirectSnippet( $terms ) {
                return '';
        }

        /**
         * @return null Always null in this class
         */
        function getRedirectTitle() {
                return null;
        }

        /**
         * @return string Always '' in this class
         */
        function getSectionSnippet() {
                return '';
        }

        /**
         * @return null Always null in this class
         */
        function getSectionTitle() {
                return null;
        }

        /**
         * @return mixed Timestamp of the revision if there is one, else of the
         *   file if there is one, else ''
         */
        function getTimestamp() {
                if ( $this->mRevision ) {
                        return $this->mRevision->getTimestamp();
                }
                if ( $this->mImage ) {
                        return $this->mImage->getTimestamp();
                }
                return '';
        }

        /**
         * @return int Word count of the text, as computed by str_word_count()
         */
        function getWordCount() {
                $this->initText();
                return str_word_count( $this->mText );
        }

        /**
         * @return int Length of the text in bytes
         */
        function getByteSize() {
                $this->initText();
                return strlen( $this->mText );
        }

        /**
         * @return bool Always false in this class
         */
        function hasRelated() {
                return false;
        }

        /**
         * @return string Always '' in this class
         */
        function getInterwikiPrefix() {
                return '';
        }
}
/**
 * Result set holding at most one result: the near match found for a query.
 */
class SearchNearMatchResultSet extends SearchResultSet {
        /** @var bool Whether next() has already handed out the match */
        private $fetched = false;

        /**
         * The near match, or a falsy value when there is none.
         *
         * Declared explicitly instead of being created on first assignment.
         * Kept public because the dynamically created property was publicly
         * accessible and callers may rely on that.
         *
         * @var mixed
         */
        public $result;

        /**
         * @param mixed $match Title of the near match, or null if there is none
         */
        public function __construct( $match ) {
                $this->result = $match;
        }

        /**
         * Whether a near match is present.
         *
         * Overrides the parent method, which always answers false. Without this
         * override numRows() reported 0 even when a match existed.
         *
         * @return bool
         */
        public function hasResults() {
                return (bool)$this->result;
        }

        /**
         * Singular-named alias of hasResults(), kept for existing callers.
         *
         * @return bool
         */
        public function hasResult() {
                return $this->hasResults();
        }

        /**
         * @return int 1 when a near match is present, 0 otherwise
         */
        public function numRows() {
                return $this->hasResults() ? 1 : 0;
        }

        /**
         * Hand out the match once, wrapped in a SearchResult.
         *
         * @return SearchResult|bool False when there is no match or it was
         *   already returned
         */
        public function next() {
                if ( $this->fetched || !$this->result ) {
                        return false;
                }
                $this->fetched = true;
                return SearchResult::newFromTitle( $this->result );
        }
}
00928 
00934 class SearchHighlighter {
00935         var $mCleanWikitext = true;
00936 
00937         function __construct( $cleanupWikitext = true ) {
00938                 $this->mCleanWikitext = $cleanupWikitext;
00939         }
00940 
00950         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
00951                 global $wgContLang;
00952                 global $wgSearchHighlightBoundaries;
00953                 $fname = __METHOD__;
00954 
00955                 if ( $text == '' )
00956                         return '';
00957 
00958                 // spli text into text + templates/links/tables
00959                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
00960                 // first capture group is for detecting nested templates/links/tables/references
00961                 $endPatterns = array(
00962                         1 => '/(\{\{)|(\}\})/', // template
00963                         2 => '/(\[\[)|(\]\])/', // image
00964                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table
00965 
00966                 // @todo FIXME: This should prolly be a hook or something
00967                 if ( function_exists( 'wfCite' ) ) {
00968                         $spat .= '|(<ref>)'; // references via cite extension
00969                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
00970                 }
00971                 $spat .= '/';
00972                 $textExt = array(); // text extracts
00973                 $otherExt = array(); // other extracts
00974                 wfProfileIn( "$fname-split" );
00975                 $start = 0;
00976                 $textLen = strlen( $text );
00977                 $count = 0; // sequence number to maintain ordering
00978                 while ( $start < $textLen ) {
00979                         // find start of template/image/table
00980                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
00981                                 $epat = '';
00982                                 foreach ( $matches as $key => $val ) {
00983                                         if ( $key > 0 && $val[1] != - 1 ) {
00984                                                 if ( $key == 2 ) {
00985                                                         // see if this is an image link
00986                                                         $ns = substr( $val[0], 2, - 1 );
00987                                                         if ( $wgContLang->getNsIndex( $ns ) != NS_FILE )
00988                                                                 break;
00989 
00990                                                 }
00991                                                 $epat = $endPatterns[$key];
00992                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
00993                                                 $start = $val[1];
00994                                                 break;
00995                                         }
00996                                 }
00997                                 if ( $epat ) {
00998                                         // find end (and detect any nested elements)
00999                                         $level = 0;
01000                                         $offset = $start + 1;
01001                                         $found = false;
01002                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
01003                                                 if ( array_key_exists( 2, $endMatches ) ) {
01004                                                         // found end
01005                                                         if ( $level == 0 ) {
01006                                                                 $len = strlen( $endMatches[2][0] );
01007                                                                 $off = $endMatches[2][1];
01008                                                                 $this->splitAndAdd( $otherExt, $count,
01009                                                                         substr( $text, $start, $off + $len  - $start ) );
01010                                                                 $start = $off + $len;
01011                                                                 $found = true;
01012                                                                 break;
01013                                                         } else {
01014                                                                 // end of nested element
01015                                                                 $level -= 1;
01016                                                         }
01017                                                 } else {
01018                                                         // nested
01019                                                         $level += 1;
01020                                                 }
01021                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
01022                                         }
01023                                         if ( ! $found ) {
01024                                                 // couldn't find appropriate closing tag, skip
01025                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
01026                                                 $start += strlen( $matches[0][0] );
01027                                         }
01028                                         continue;
01029                                 }
01030                         }
01031                         // else: add as text extract
01032                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
01033                         break;
01034                 }
01035 
01036                 $all = $textExt + $otherExt; // these have disjunct key sets
01037 
01038                 wfProfileOut( "$fname-split" );
01039 
01040                 // prepare regexps
01041                 foreach ( $terms as $index => $term ) {
01042                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
01043                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
01044                                 $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
01045                         } else {
01046                                 $terms[$index] = $term;
01047                         }
01048                 }
01049                 $anyterm = implode( '|', $terms );
01050                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
01051 
01052                 // @todo FIXME: A hack to scale contextchars, a correct solution
01053                 // would be to have contextchars actually be char and not byte
01054                 // length, and do proper utf-8 substrings and lengths everywhere,
01055                 // but PHP is making that very hard and unclean to implement :(
01056                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
01057                 $contextchars = intval( $contextchars * $scale );
01058 
01059                 $patPre = "(^|$wgSearchHighlightBoundaries)";
01060                 $patPost = "($wgSearchHighlightBoundaries|$)";
01061 
01062                 $pat1 = "/(" . $phrase . ")/ui";
01063                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
01064 
01065                 wfProfileIn( "$fname-extract" );
01066 
01067                 $left = $contextlines;
01068 
01069                 $snippets = array();
01070                 $offsets = array();
01071 
01072                 // show beginning only if it contains all words
01073                 $first = 0;
01074                 $firstText = '';
01075                 foreach ( $textExt as $index => $line ) {
01076                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
01077                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
01078                                 $first = $index;
01079                                 break;
01080                         }
01081                 }
01082                 if ( $firstText ) {
01083                         $succ = true;
01084                         // check if first text contains all terms
01085                         foreach ( $terms as $term ) {
01086                                 if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
01087                                         $succ = false;
01088                                         break;
01089                                 }
01090                         }
01091                         if ( $succ ) {
01092                                 $snippets[$first] = $firstText;
01093                                 $offsets[$first] = 0;
01094                         }
01095                 }
01096                 if ( ! $snippets ) {
01097                         // match whole query on text
01098                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
01099                         // match whole query on templates/tables/images
01100                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
01101                         // match any words on text
01102                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
01103                         // match any words on templates/tables/images
01104                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
01105 
01106                         ksort( $snippets );
01107                 }
01108 
01109                 // add extra chars to each snippet to make snippets constant size
01110                 $extended = array();
01111                 if ( count( $snippets ) == 0 ) {
01112                         // couldn't find the target words, just show beginning of article
01113                         if ( array_key_exists( $first, $all ) ) {
01114                                 $targetchars = $contextchars * $contextlines;
01115                                 $snippets[$first] = '';
01116                                 $offsets[$first] = 0;
01117                         }
01118                 } else {
01119                         // if begin of the article contains the whole phrase, show only that !!
01120                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
01121                                 && $offsets[$first] < $contextchars * 2 ) {
01122                                 $snippets = array ( $first => $snippets[$first] );
01123                         }
01124 
01125                         // calc by how much to extend existing snippets
01126                         $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
01127                 }
01128 
01129                 foreach ( $snippets as $index => $line ) {
01130                         $extended[$index] = $line;
01131                         $len = strlen( $line );
01132                         if ( $len < $targetchars - 20 ) {
01133                                 // complete this line
01134                                 if ( $len < strlen( $all[$index] ) ) {
01135                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
01136                                         $len = strlen( $extended[$index] );
01137                                 }
01138 
01139                                 // add more lines
01140                                 $add = $index + 1;
01141                                 while ( $len < $targetchars - 20
01142                                                 && array_key_exists( $add, $all )
01143                                                 && !array_key_exists( $add, $snippets ) ) {
01144                                         $offsets[$add] = 0;
01145                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
01146                                         $extended[$add] = $tt;
01147                                         $len += strlen( $tt );
01148                                         $add++;
01149                                 }
01150                         }
01151                 }
01152 
01153                 // $snippets = array_map( 'htmlspecialchars', $extended );
01154                 $snippets = $extended;
01155                 $last = - 1;
01156                 $extract = '';
01157                 foreach ( $snippets as $index => $line ) {
01158                         if ( $last == - 1 )
01159                                 $extract .= $line; // first line
01160                         elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) )
01161                                 $extract .= " " . $line; // continous lines
01162                         else
01163                                 $extract .= '<b> ... </b>' . $line;
01164 
01165                         $last = $index;
01166                 }
01167                 if ( $extract )
01168                         $extract .= '<b> ... </b>';
01169 
01170                 $processed = array();
01171                 foreach ( $terms as $term ) {
01172                         if ( ! isset( $processed[$term] ) ) {
01173                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
01174                                 $extract = preg_replace( $pat3,
01175                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
01176                                 $processed[$term] = true;
01177                         }
01178                 }
01179 
01180                 wfProfileOut( "$fname-extract" );
01181 
01182                 return $extract;
01183         }
01184 
/**
 * Split text into lines and append every non-blank line to the extract list.
 *
 * When $this->mCleanWikitext is set, the text is first run through
 * removeWiki(). Each line is trimmed, and lines that are empty after
 * trimming are skipped.
 *
 * @param array &$extracts List being built; each kept line is stored at index $count
 * @param int &$count Next index to fill in $extracts; incremented once per kept line
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
        if ( $this->mCleanWikitext ) {
                $text = $this->removeWiki( $text );
        }

        foreach ( explode( "\n", $text ) as $line ) {
                $trimmed = trim( $line );
                // Compare against '' explicitly: a line consisting only of "0" is
                // falsy in PHP but is still real content and must be kept.
                if ( $trimmed !== '' ) {
                        $extracts[$count++] = $trimmed;
                }
        }
}
01200 
/**
 * Regex callback: turn a matched multi-byte sequence into a character class
 * holding its lower-case and upper-case forms.
 *
 * Matches of at most one byte are returned unchanged.
 *
 * @param array $matches Match array; only $matches[0] (the full match) is used
 * @return string Either "[<lower><upper>]" or the original match
 */
function caseCallback( $matches ) {
        global $wgContLang;

        $matched = $matches[0];
        if ( strlen( $matched ) <= 1 ) {
                // Nothing longer than one byte: hand it back untouched
                return $matched;
        }

        $lower = $wgContLang->lc( $matched );
        $upper = $wgContLang->uc( $matched );
        return '[' . $lower . $upper . ']';
}
01215 
/**
 * Cut the part of $text lying between two byte offsets, after letting
 * position() adjust both ends.
 *
 * A non-zero $start is passed to position() with an offset of 1. An $end at
 * or past the end of the text is clamped to the text length; any other $end
 * is passed to position().
 *
 * @param string $text Text to cut from
 * @param int $start Requested start offset
 * @param int $end Requested end offset
 * @param int|null &$posStart If not null on entry, receives the adjusted start
 * @param int|null &$posEnd If not null on entry, receives the adjusted end
 * @return string The substring, or '' when the adjusted end is not past the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
        $length = strlen( $text );

        if ( $start != 0 ) {
                $start = $this->position( $text, $start, 1 );
        }
        $end = ( $end >= $length ) ? $length : $this->position( $text, $end );

        // Report the adjusted offsets only to callers that supplied a value
        if ( $posStart !== null ) {
                $posStart = $start;
        }
        if ( $posEnd !== null ) {
                $posEnd = $end;
        }

        return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
01249 
/**
 * Find a cut position in $text close to byte offset $point.
 *
 * The window of $tolerance bytes on either side of $point is searched for
 * the first space or punctuation character. If one is found, its offset
 * (plus $offset) is returned. Otherwise $point itself is used, moved forward
 * past any UTF-8 continuation bytes (0x80-0xBF) so that the cut does not
 * land inside a multi-byte character.
 *
 * @param string $text Text to search
 * @param int $point Requested byte offset
 * @param int $offset Value added to the offset of a boundary character that was found
 * @return int Byte offset; never more than strlen( $text ) in the fallback path
 */
function position( $text, $point, $offset = 0 ) {
        $tolerance = 10;
        $length = strlen( $text );
        $s = max( 0, $point - $tolerance );
        $l = min( $length, $point + $tolerance ) - $s;
        $m = array();
        if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
                // $m[0][1] is relative to the window, so shift it back by $s
                return $m[0][1] + $s + $offset;
        }

        // No boundary character nearby. Reading $text[$point] at or past the end
        // of the string raises an "Uninitialized string offset" error, so the end
        // of the text is the only sensible answer there.
        if ( $point >= $length ) {
                return $length;
        }

        // check if point is on a valid first UTF8 char
        $char = ord( $text[$point] );
        while ( $char >= 0x80 && $char < 0xc0 ) {
                // skip trailing bytes
                $point++;
                if ( $point >= $length ) {
                        return $length;
                }
                $char = ord( $text[$point] );
        }
        return $point;
}
01279 
/**
 * Collect snippets from the lines that match $pattern.
 *
 * Lines whose index is already present in $out are skipped. For every other
 * line that matches, a window of $contextchars bytes is chosen around the
 * first match, cut out with extract(), and stored in $out. The start offset
 * reported by extract() is stored in $offsets under the same index.
 * Processing stops as soon as $linesleft reaches zero.
 *
 * @param string $pattern Regular expression to look for
 * @param array $extracts Lines to search, keyed by line index
 * @param int &$linesleft Number of snippets still wanted; decremented per snippet
 * @param int &$contextchars Window length in bytes
 * @param array &$out Snippets found so far, keyed by line index
 * @param array &$offsets Start offset of each snippet within its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
        if ( $linesleft == 0 ) {
                return; // nothing to do
        }

        foreach ( $extracts as $lineIndex => $content ) {
                $hit = array();
                // Skip lines that already have a snippet, and lines without a match
                if ( array_key_exists( $lineIndex, $out )
                        || !preg_match( $pattern, $content, $hit, PREG_OFFSET_CAPTURE )
                ) {
                        continue;
                }

                list( $matched, $matchPos ) = $hit[0];
                $matchLen = strlen( $matched );

                if ( $matchPos + $matchLen < $contextchars ) {
                        // The match ends inside the first window: start at the line beginning
                        $from = 0;
                } elseif ( $matchLen > $contextchars ) {
                        // The match is longer than the window: start at the match
                        $from = $matchPos;
                } else {
                        // Centre the match inside the window
                        $from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
                }

                // extract() overwrites this with the start offset it actually used
                $actualFrom = $from;
                $out[$lineIndex] = $this->extract( $content, $from, $from + $contextchars, $actualFrom );
                $offsets[$lineIndex] = $actualFrom;

                if ( --$linesleft == 0 ) {
                        return;
                }
        }
}
01322 
/**
 * Strip a basic set of wiki markup from text using a fixed sequence of
 * regular-expression replacements.
 *
 * The order matters: each replacement works on the output of the one before.
 *
 * @param string $text Wikitext
 * @return string Text after all replacements
 */
function removeWiki( $text ) {
        $fname = __METHOD__;
        wfProfileIn( $fname );

        // Template and plain-link rules, as pattern/replacement pairs
        $braceAndLinkRules = array(
                // {{...}} without a pipe: removed
                array( "/\\{\\{([^|]+?)\\}\\}/", "" ),
                // {{x|rest}}: keep what follows the first pipe
                array( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ),
                // [[x]] without a pipe: keep x
                array( "/\\[\\[([^|]+?)\\]\\]/", "\\1" ),
        );
        foreach ( $braceAndLinkRules as $rule ) {
                $text = preg_replace( $rule[0], $rule[1], $text );
        }

        // [[x|caption]]: linkReplace() decides what to keep
        $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

        // Tags and quote markup, all replaced by the empty string
        $strippedPatterns = array(
                "/<\/?[^>]+>/",
                "/'''''/",
                "/('''|<\/?[iIuUbB]>)/",
                "/''/",
        );
        foreach ( $strippedPatterns as $pattern ) {
                $text = preg_replace( $pattern, "", $text );
        }

        wfProfileOut( $fname );
        return $text;
}
01351 
/**
 * Regex callback for piped links: choose between the caption and the whole link.
 *
 * The text of $matches[1] before its first colon is looked up as a namespace
 * name. If it resolves to NS_FILE or NS_CATEGORY the complete match is
 * returned; in every other case (including no colon at all) only the
 * caption in $matches[2] is returned.
 *
 * @param array $matches 0 => full match, 1 => link target with trailing pipe, 2 => caption
 * @return string
 */
function linkReplace( $matches ) {
        global $wgContLang;

        $colonPos = strpos( $matches[1], ':' );
        if ( $colonPos !== false ) {
                $prefix = substr( $matches[1], 0, $colonPos );
                $nsIndex = $wgContLang->getNsIndex( $prefix );
                if ( $nsIndex !== false && ( $nsIndex == NS_FILE || $nsIndex == NS_CATEGORY ) ) {
                        return $matches[0]; // return the whole thing
                }
        }

        return $matches[2]; // replace with caption
}
01371 
/**
 * Build a simple highlighted extract: one output line per input line that
 * contains a term, with the matched terms wrapped in
 * <span class='searchmatch'>.
 *
 * The terms are joined with '|' and inserted into the patterns verbatim, so
 * they must already be valid regular-expression fragments.
 * NOTE(review): presumably callers escape them - confirm.
 *
 * @param string $text Text to search, split on "\n"
 * @param array $terms Regular-expression fragments to look for
 * @param int $contextlines Maximum number of matching lines to output
 * @param int $contextchars Context length passed to $wgContLang->truncate()
 *   for the text before and after the match
 * @return string HTML; each output line is terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
        global $wgContLang;
        $fname = __METHOD__;

        $lines = explode( "\n", $text );

        $terms = implode( '|', $terms );
        $max = intval( $contextchars ) + 1;
        $pat1 = "/(.*)($terms)(.{0,$max})/i";
        // Loop-invariant, so built once rather than per line
        $pat2 = '/(' . $terms . ")/i";

        $extract = "";
        wfProfileIn( "$fname-extract" );
        foreach ( $lines as $line ) {
                if ( 0 == $contextlines ) {
                        break;
                }
                $m = array();
                if ( ! preg_match( $pat1, $line, $m ) ) {
                        continue;
                }
                --$contextlines;
                // truncate function changes ... to relevant i18n message.
                $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
                // Group 3 always takes part in a successful match (it may be empty)
                $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );

                $snippet = $pre . $m[2] . $post;

                // Match on the raw snippet and escape each piece separately.
                // Escaping first and matching afterwards lets a term such as "amp"
                // match inside "&amp;" and corrupt the entity, and prevents terms
                // containing &, < or " from ever matching.
                $html = '';
                $cursor = 0;
                $hits = array();
                if ( preg_match_all( $pat2, $snippet, $hits, PREG_OFFSET_CAPTURE ) ) {
                        foreach ( $hits[0] as $hit ) {
                                list( $matched, $at ) = $hit;
                                $html .= htmlspecialchars( (string)substr( $snippet, $cursor, $at - $cursor ) );
                                $html .= "<span class='searchmatch'>" . htmlspecialchars( $matched ) . '</span>';
                                $cursor = $at + strlen( $matched );
                        }
                }
                $html .= htmlspecialchars( (string)substr( $snippet, $cursor ) );

                $extract .= $html . "\n";
        }
        wfProfileOut( "$fname-extract" );

        return $extract;
}
01427 
01428 }
01429 
/**
 * Search engine that adds nothing to SearchEngine.
 *
 * It inherits every default of the base class unchanged, so searchText()
 * and searchTitle() keep returning null.
 */
class SearchEngineDummy extends SearchEngine {
        // Intentionally empty
}