MediaWiki  REL1_22
SearchEngine.php
Go to the documentation of this file.
00001 <?php
/**
 * Generic search engine front-end.
 *
 * In this base class the actual search and index-maintenance methods are
 * stubs: searchText() and searchTitle() return null, while update(),
 * updateTitle() and delete() do nothing. Presumably concrete back-ends
 * subclass this and override them (no such subclass is visible here).
 *
 * The class also hosts a number of static helpers: "near match" title
 * resolution, namespace lists, and the factory method create().
 */
class SearchEngine {
    /** Result limit; written by setLimitOffset(). */
    public $limit = 10;

    /** Result offset; written by setLimitOffset(). */
    public $offset = 0;

    /** Not read or written by any code in this class. */
    public $prefix = '';

    /** Not read or written by any code in this class. */
    public $searchTerms = array();

    /**
     * Namespace indexes to search. replacePrefixes() may set this to null
     * when the query starts with the "searchall" keyword.
     */
    public $namespaces = array( NS_MAIN );

    /** Not read or written by any code in this class. */
    public $showRedirects = false;

    /** Feature name => data, as stored by setFeatureData(). */
    protected $features = array();

    /** Database handle: the constructor argument, or wfGetDB( DB_SLAVE ). */
    protected $db;

    /**
     * @param mixed $db Database handle to use; when falsy, one is obtained
     *   from wfGetDB( DB_SLAVE ).
     */
    function __construct( $db = null ) {
        $this->db = $db ? $db : wfGetDB( DB_SLAVE );
    }

    /**
     * Full-text search stub.
     *
     * @param string $term Search term (unused here).
     * @return null Always null in this base class.
     */
    function searchText( $term ) {
        return null;
    }

    /**
     * Title search stub.
     *
     * @param string $term Search term (unused here).
     * @return null Always null in this base class.
     */
    function searchTitle( $term ) {
        return null;
    }

    /**
     * Deprecated alias for supports( 'list-redirects' ).
     *
     * @deprecated since 1.18 (as reported to wfDeprecated()).
     * @return bool
     */
    function acceptListRedirects() {
        wfDeprecated( __METHOD__, '1.18' );
        return $this->supports( 'list-redirects' );
    }

    /**
     * Tell whether this engine supports a named feature.
     *
     * Only 'list-redirects' and 'search-update' are reported as supported;
     * everything else (including 'title-suffix-filter') is not.
     *
     * @param string $feature Feature name.
     * @return bool
     */
    public function supports( $feature ) {
        // Loose comparison on purpose: it mirrors switch/case matching.
        return $feature == 'list-redirects' || $feature == 'search-update';
    }

    /**
     * Store data for a feature under its name.
     *
     * @param string $feature Feature name, used as the array key.
     * @param mixed $data Value to remember.
     */
    public function setFeatureData( $feature, $data ) {
        $this->features[$feature] = $data;
    }

    /**
     * Normalise text by running it through the content language's
     * segmentByWord() (some languages such as Chinese require word
     * segmentation).
     *
     * @param string $string
     * @return string Whatever $wgContLang->segmentByWord() returns.
     */
    public function normalizeText( $string ) {
        global $wgContLang;

        $segmented = $wgContLang->segmentByWord( $string );
        return $segmented;
    }

    /**
     * Transform a search term; the base implementation is the identity.
     *
     * @param string $term
     * @return string The term, unchanged.
     */
    function transformSearchTerm( $term ) {
        return $term;
    }

    /**
     * Find a title that closely matches the search term, then let the
     * 'SearchGetNearMatchComplete' hook inspect or replace the result.
     *
     * @param string $searchterm
     * @return mixed The matched title, or null when nothing matched
     *   (hooks may substitute another value).
     */
    public static function getNearMatch( $searchterm ) {
        $match = self::getNearMatchInternal( $searchterm );
        wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$match ) );

        return $match;
    }

    /**
     * Same as getNearMatch(), wrapped in a SearchNearMatchResultSet.
     *
     * @param string $searchterm
     * @return SearchNearMatchResultSet
     */
    public static function getNearMatchResultSet( $searchterm ) {
        $match = self::getNearMatch( $searchterm );
        return new SearchNearMatchResultSet( $match );
    }

    /**
     * Worker for getNearMatch(): does everything except the final
     * 'SearchGetNearMatchComplete' hook.
     *
     * For the term and each of its language variants it tries, in order:
     * the exact title, then lower-case, ucwords, upper-case and
     * ucwordbreaks spellings, then the 'SearchGetNearMatch' hook. If none
     * of that yields a result it falls back to IP contributions, user
     * pages, files, MediaWiki-namespace pages and finally a retry with
     * surrounding double quotes removed.
     *
     * @param string $searchterm
     * @return mixed A title, or null.
     */
    private static function getNearMatchInternal( $searchterm ) {
        global $wgContLang, $wgEnableSearchContributorsByIP;

        $candidates = array( $searchterm );
        if ( $wgContLang->hasVariants() ) {
            $candidates = array_merge(
                $candidates,
                $wgContLang->autoConvertToAllVariants( $searchterm )
            );
        }

        $titleResult = null;
        $proceed = wfRunHooks( 'SearchGetNearMatchBefore', array( $candidates, &$titleResult ) );
        if ( !$proceed ) {
            return $titleResult;
        }

        // Content-language case conversions, tried in this order:
        // all lower case (i.e. first letter capitalized), capitalized
        // string, all upper case, Word-Caps-Breaking-At-Word-Breaks
        // (for hyphenated names etc).
        $caseConversions = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

        foreach ( $candidates as $term ) {
            # Exact match? No need to look further.
            $title = Title::newFromText( $term );
            if ( $title === null ) {
                return null;
            }

            # Try files if searching in the Media: namespace
            if ( $title->getNamespace() == NS_MEDIA ) {
                $title = Title::makeTitle( NS_FILE, $title->getText() );
            }

            if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
                return $title;
            }

            # See if it still otherwise has content in some sane sense
            if ( WikiPage::factory( $title )->hasViewableContent() ) {
                return $title;
            }

            if ( !wfRunHooks( 'SearchAfterNoDirectMatch', array( $term, &$title ) ) ) {
                return $title;
            }

            foreach ( $caseConversions as $conversion ) {
                $title = Title::newFromText( $wgContLang->$conversion( $term ) );
                if ( $title && $title->exists() ) {
                    return $title;
                }
            }

            // Give hooks a chance at better match variants
            $title = null;
            if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
                return $title;
            }
        }

        $title = Title::newFromText( $searchterm );

        # Entering an IP address goes to the contributions page
        if ( $wgEnableSearchContributorsByIP ) {
            $isIpUserPage = $title->getNamespace() == NS_USER && User::isIP( $title->getText() );
            if ( $isIpUserPage || User::isIP( trim( $searchterm ) ) ) {
                return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
            }
        }

        # Entering a user goes to the user page whether it's there or not
        if ( $title->getNamespace() == NS_USER ) {
            return $title;
        }

        # Go to images that exist even if there's no local page.
        # There may have been a funny upload, or it may be on a shared
        # file repository such as Wikimedia Commons.
        if ( $title->getNamespace() == NS_FILE && wfFindFile( $title ) ) {
            return $title;
        }

        # MediaWiki namespace? Page may be "implied" if not customized.
        # Just return it, with caps forced as the message system likes it.
        if ( $title->getNamespace() == NS_MEDIAWIKI ) {
            return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
        }

        # Quoted term? Try without the quotes...
        $unquoted = array();
        if ( preg_match( '/^"([^"]+)"$/', $searchterm, $unquoted ) ) {
            return SearchEngine::getNearMatch( $unquoted[1] );
        }

        return null;
    }

    /**
     * Characters considered legal in a search term, written as the inside
     * of a PCRE character class (see filter()).
     *
     * @return string
     */
    public static function legalSearchChars() {
        return "A-Za-z_'.0-9\\x80-\\xFF\\-";
    }

    /**
     * Set the result limit and offset; both are coerced to integers.
     *
     * @param mixed $limit
     * @param mixed $offset
     */
    function setLimitOffset( $limit, $offset = 0 ) {
        $this->limit = (int)$limit;
        $this->offset = (int)$offset;
    }

    /**
     * Set the namespaces to search.
     *
     * @param mixed $namespaces Stored as-is in $this->namespaces.
     */
    function setNamespaces( $namespaces ) {
        $this->namespaces = $namespaces;
    }

    /**
     * Strip a leading "all:" keyword or namespace prefix from the query and
     * adjust $this->namespaces accordingly.
     *
     * The "all" keyword is the content-language text of the 'searchall'
     * message followed by a colon; it sets $this->namespaces to null. Any
     * other prefix that the content language resolves to a namespace index
     * restricts $this->namespaces to that single index. If stripping would
     * leave nothing but whitespace, the original query is kept. The
     * 'SearchEngineReplacePrefixesComplete' hook always runs last and may
     * change the returned string.
     *
     * @param string $query
     * @return string The query with the recognised prefix removed.
     */
    function replacePrefixes( $query ) {
        global $wgContLang;

        $parsed = $query;
        $colonPos = strpos( $query, ':' );

        if ( $colonPos !== false ) {
            $allkeyword = wfMessage( 'searchall' )->inContentLanguage()->text() . ":";
            $keywordLen = strlen( $allkeyword );

            if ( strncmp( $query, $allkeyword, $keywordLen ) == 0 ) {
                $this->namespaces = null;
                $parsed = substr( $query, $keywordLen );
            } else {
                $prefix = substr( $query, 0, $colonPos );
                $index = $wgContLang->getNsIndex( $prefix );
                if ( $index !== false ) {
                    $this->namespaces = array( $index );
                    $parsed = substr( $query, strlen( $prefix ) + 1 );
                }
            }

            if ( trim( $parsed ) == '' ) {
                $parsed = $query; // prefix was the whole query
            }
        }

        wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

        return $parsed;
    }

    /**
     * List the namespaces that can be searched: every content-language
     * namespace whose index is >= NS_MAIN, as adjusted by the
     * 'SearchableNamespaces' hook.
     *
     * @return array Namespace index => name.
     */
    public static function searchableNamespaces() {
        global $wgContLang;

        $searchable = array();
        foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
            if ( !( $ns >= NS_MAIN ) ) {
                continue;
            }
            $searchable[$ns] = $name;
        }

        wfRunHooks( 'SearchableNamespaces', array( &$searchable ) );
        return $searchable;
    }

    /**
     * Namespaces a user has chosen to search.
     *
     * The 'searcheverything' option wins when it is set and either
     * $wgSearchEverythingOnlyLoggedIn is off or the user is logged in.
     * Otherwise the per-namespace 'searchNs<index>' options are consulted.
     *
     * @param object $user Object offering isLoggedIn() and getOption().
     * @return array List of namespace indexes.
     */
    public static function userNamespaces( $user ) {
        global $wgSearchEverythingOnlyLoggedIn;

        $searchable = SearchEngine::searchableNamespaces();

        $mayUseEverything = !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn();
        if ( $mayUseEverything && $user->getOption( 'searcheverything' ) ) {
            return array_keys( $searchable );
        }

        $selected = array();
        foreach ( array_keys( $searchable ) as $ns ) {
            if ( $user->getOption( 'searchNs' . $ns ) ) {
                $selected[] = $ns;
            }
        }

        return $selected;
    }

    /**
     * Snippet highlighting settings; hard-coded.
     *
     * @return array array( context lines, context chars ) = array( 2, 75 ).
     */
    public static function userHighlightPrefs() {
        return array( 2, 75 );
    }

    /**
     * Namespaces searched by default: the keys of
     * $wgNamespacesToBeSearchedDefault whose value loosely equals true.
     *
     * @return array List of namespace indexes.
     */
    public static function defaultNamespaces() {
        global $wgNamespacesToBeSearchedDefault;

        $enabled = array_keys( $wgNamespacesToBeSearchedDefault, true );
        return $enabled;
    }

    /**
     * Convert namespace indexes to display names via the content
     * language's getFormattedNsText(); empty names are replaced with the
     * 'blanknamespace' message.
     *
     * @param array $namespaces
     * @return array Same keys, textual names as values.
     */
    public static function namespacesAsText( $namespaces ) {
        global $wgContLang;

        $labels = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
        foreach ( $labels as $key => $label ) {
            if ( !$label ) {
                $labels[$key] = wfMessage( 'blanknamespace' )->text();
            }
        }
        return $labels;
    }

    /**
     * Namespaces flagged in $wgNamespacesToBeSearchedHelp (keys whose value
     * loosely equals true).
     *
     * @return array List of namespace indexes.
     */
    public static function helpNamespaces() {
        global $wgNamespacesToBeSearchedHelp;

        $enabled = array_keys( $wgNamespacesToBeSearchedHelp, true );
        return $enabled;
    }

    /**
     * Replace every character that is not in legalSearchChars() with a
     * space, then trim the result.
     *
     * @param string $text
     * @return string
     */
    function filter( $text ) {
        $legal = $this->legalSearchChars();
        $cleaned = preg_replace( '/[^' . $legal . ']/', " ", $text );
        return trim( $cleaned );
    }

    /**
     * Instantiate a search engine.
     *
     * Class selection: $type if it is one of getSearchTypes(); otherwise
     * $wgSearchType when it is not null; otherwise whatever the DB_SLAVE
     * connection's getSearchEngine() names. The new object has its limit
     * and offset both reset to 0.
     *
     * @param string|null $type Requested engine class name.
     * @return SearchEngine
     */
    public static function create( $type = null ) {
        global $wgSearchType;

        // Only populated on the database-default path; the constructor
        // copes with null by fetching a connection itself.
        $dbr = null;
        $known = self::getSearchTypes();

        if ( $type && in_array( $type, $known ) ) {
            $class = $type;
        } elseif ( $wgSearchType !== null ) {
            $class = $wgSearchType;
        } else {
            $dbr = wfGetDB( DB_SLAVE );
            $class = $dbr->getSearchEngine();
        }

        $engine = new $class( $dbr );
        $engine->setLimitOffset( 0, 0 );
        return $engine;
    }

    /**
     * List the configured engine classes: $wgSearchType first, followed by
     * $wgSearchTypeAlternatives. The list is computed once per request and
     * cached in a static variable.
     *
     * @return array
     */
    public static function getSearchTypes() {
        global $wgSearchType, $wgSearchTypeAlternatives;
        static $alternatives = null;

        if ( $alternatives !== null ) {
            return $alternatives;
        }

        $alternatives = $wgSearchTypeAlternatives ?: array();
        array_unshift( $alternatives, $wgSearchType );

        return $alternatives;
    }

    /**
     * Index-update stub; does nothing here.
     *
     * @param mixed $id
     * @param mixed $title
     * @param mixed $text
     */
    function update( $id, $title, $text ) {
        // no-op
    }

    /**
     * Title-only index-update stub; does nothing here.
     *
     * @param mixed $id
     * @param mixed $title
     */
    function updateTitle( $id, $title ) {
        // no-op
    }

    /**
     * Index-removal stub; does nothing here.
     *
     * @param mixed $id
     * @param mixed $title
     */
    function delete( $id, $title ) {
        // no-op
    }

    /**
     * URL template for OpenSearch: $wgOpenSearchTemplate when set,
     * otherwise an api.php "action=opensearch" URL on $wgCanonicalServer
     * restricted to the default namespaces ("0" when that list is empty).
     *
     * @return string
     */
    public static function getOpenSearchTemplate() {
        global $wgOpenSearchTemplate, $wgCanonicalServer;

        if ( $wgOpenSearchTemplate ) {
            return $wgOpenSearchTemplate;
        }

        $ns = implode( '|', SearchEngine::defaultNamespaces() );
        if ( !$ns ) {
            $ns = "0";
        }

        return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
    }

    /**
     * Text of a content object for indexing purposes.
     *
     * @param Title $t Not used by this implementation.
     * @param Content $c
     * @return string $c->getTextForSearchIndex(), or '' when $c is null.
     */
    public function getTextFromContent( Title $t, Content $c = null ) {
        if ( !$c ) {
            return '';
        }
        return $c->getTextForSearchIndex();
    }

    /**
     * @return bool Always false in this base class.
     */
    public function textAlreadyUpdatedForIndex() {
        return false;
    }
}
00576 
/**
 * Base search result set.
 *
 * Every method here reports an empty set (no rows, no suggestion, no
 * interwiki results). Classes further down this file extend it and override
 * the methods they need.
 */
class SearchResultSet {
    /**
     * @return array Always an empty array in the base class.
     */
    public function termMatches() {
        return array();
    }

    /**
     * @return int Always 0 in the base class.
     */
    public function numRows() {
        return 0;
    }

    /**
     * @return bool Always false in the base class.
     */
    public function hasResults() {
        return false;
    }

    /**
     * @return null Always null in the base class.
     */
    public function getTotalHits() {
        return null;
    }

    /**
     * @return bool Always false in the base class.
     */
    public function hasSuggestion() {
        return false;
    }

    /**
     * @return null Always null in the base class.
     */
    public function getSuggestionQuery() {
        return null;
    }

    /**
     * @return string Always the empty string in the base class.
     */
    public function getSuggestionSnippet() {
        return '';
    }

    /**
     * @return null Always null in the base class.
     */
    public function getInfo() {
        return null;
    }

    /**
     * @return null Always null in the base class.
     */
    public function getInterwikiResults() {
        return null;
    }

    /**
     * Whether getInterwikiResults() yields something that does not loosely
     * equal null.
     *
     * @return bool
     */
    public function hasInterwikiResults() {
        $interwiki = $this->getInterwikiResults();
        return $interwiki != null;
    }

    /**
     * Fetch the next result.
     *
     * @return bool Always false (no more rows) in the base class.
     */
    public function next() {
        return false;
    }

    /**
     * Release resources held by the set; nothing to do in the base class.
     */
    public function free() {
        // ...
    }
}
00689 
/**
 * Result set backed by a database result object.
 *
 * The wrapped result set may be the boolean false; in that case numRows(),
 * next() and free() all return false without touching it.
 */
class SqlSearchResultSet extends SearchResultSet {

    /** Result object handed to the constructor, or false. */
    protected $mResultSet;

    /**
     * Search terms handed to the constructor; returned by termMatches().
     *
     * Declared explicitly so the constructor no longer creates a dynamic
     * property. Kept public because the previously undeclared property was
     * effectively public.
     */
    public $mTerms;

    /**
     * @param mixed $resultSet Object offering numRows(), fetchObject() and
     *   free(), or false.
     * @param mixed $terms Value to report from termMatches().
     */
    function __construct( $resultSet, $terms ) {
        $this->mResultSet = $resultSet;
        $this->mTerms = $terms;
    }

    /**
     * @return mixed The terms given to the constructor.
     */
    function termMatches() {
        return $this->mTerms;
    }

    /**
     * @return int|bool Row count of the wrapped result, or false when the
     *   wrapped result is false.
     */
    function numRows() {
        if ( $this->mResultSet === false ) {
            return false;
        }

        return $this->mResultSet->numRows();
    }

    /**
     * Fetch the next row and wrap it in a SearchResult.
     *
     * @return SearchResult|bool False when the wrapped result is false or
     *   fetchObject() reports no more rows.
     */
    function next() {
        if ( $this->mResultSet === false ) {
            return false;
        }

        $row = $this->mResultSet->fetchObject();
        if ( $row === false ) {
            return false;
        }

        return SearchResult::newFromRow( $row );
    }

    /**
     * Free the wrapped result.
     *
     * @return bool|null False when there is nothing to free; otherwise no
     *   value is returned.
     */
    function free() {
        if ( $this->mResultSet === false ) {
            return false;
        }

        $this->mResultSet->free();
    }
}
00735 
/**
 * Marker class with no members.
 *
 * Some search engines may bail out if too many matches are found.
 */
class SearchResultTooMany {
}
00742 
/**
 * A single search hit: a title plus, when available, its revision and/or
 * file object.
 *
 * Several accessors (score, title/redirect/section snippets, related
 * results, interwiki prefix) return fixed placeholder values in this class.
 */
class SearchResult {

    /** Revision loaded by initFromTitle(), or null. */
    public $mRevision = null;

    /** File found by wfFindFile() for NS_FILE titles, or null. */
    public $mImage = null;

    /** Title this result refers to; null marks a broken result. */
    public $mTitle;

    /** Lazily loaded page text; see initText(). */
    public $mText;

    /**
     * Build a result from a title.
     *
     * @param mixed $title Title object, or null.
     * @return SearchResult
     */
    public static function newFromTitle( $title ) {
        $instance = new self();
        $instance->initFromTitle( $title );

        return $instance;
    }

    /**
     * Build a result from a database row.
     *
     * @param object $row Row exposing page_namespace and page_title.
     * @return SearchResult
     */
    public static function newFromRow( $row ) {
        $instance = new self();
        $instance->initFromRow( $row );

        return $instance;
    }

    /**
     * @param object|null $row When given, the object is initialised from it
     *   (backwards compatibility with pre-1.17 callers).
     */
    public function __construct( $row = null ) {
        if ( $row !== null ) {
            $this->initFromRow( $row );
        }
    }

    /**
     * Initialise from a row by building a title out of its
     * page_namespace / page_title fields.
     *
     * @param object $row
     */
    protected function initFromRow( $row ) {
        $title = Title::makeTitle( $row->page_namespace, $row->page_title );
        $this->initFromTitle( $title );
    }

    /**
     * Initialise from a title.
     *
     * For a non-null title this runs the 'SearchResultInitFromTitle' hook
     * (which may supply a revision id), loads the revision, and for titles
     * in NS_FILE also looks up the file.
     *
     * @param mixed $title Title object, or null.
     */
    protected function initFromTitle( $title ) {
        $this->mTitle = $title;
        if ( $this->mTitle === null ) {
            return;
        }

        $id = false;
        wfRunHooks( 'SearchResultInitFromTitle', array( $title, &$id ) );
        $this->mRevision = Revision::newFromTitle( $this->mTitle, $id, Revision::READ_NORMAL );

        if ( $this->mTitle->getNamespace() === NS_FILE ) {
            $this->mImage = wfFindFile( $this->mTitle );
        }
    }

    /**
     * @return bool True when no title is attached to this result.
     */
    function isBrokenTitle() {
        return $this->mTitle === null;
    }

    /**
     * @return bool True when neither a revision nor a file was found.
     */
    function isMissingRevision() {
        return !( $this->mRevision || $this->mImage );
    }

    /**
     * @return mixed The title given at initialisation.
     */
    function getTitle() {
        return $this->mTitle;
    }

    /**
     * @return null Always null in this class.
     */
    function getScore() {
        return null;
    }

    /**
     * Load $this->mText once: the revision's content run through a freshly
     * created search engine's getTextFromContent(), or '' when there is no
     * revision.
     */
    protected function initText() {
        if ( isset( $this->mText ) ) {
            return;
        }

        if ( $this->mRevision == null ) {
            // TODO: can we fetch raw wikitext for commons images?
            $this->mText = '';
            return;
        }

        $this->mText = SearchEngine::create()
            ->getTextFromContent( $this->mTitle, $this->mRevision->getContent() );
    }

    /**
     * Highlighted snippet of the page text.
     *
     * Uses SearchHighlighter::highlightText() when
     * $wgAdvancedSearchHighlighting is on, highlightSimple() otherwise.
     *
     * @param array $terms Terms to highlight.
     * @return string
     */
    function getTextSnippet( $terms ) {
        global $wgAdvancedSearchHighlighting;
        $this->initText();

        // TODO: make highliter take a content object. Make ContentHandler a factory for SearchHighliter.
        list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
        $highlighter = new SearchHighlighter();
        $method = $wgAdvancedSearchHighlighting ? 'highlightText' : 'highlightSimple';

        return $highlighter->$method( $this->mText, $terms, $contextlines, $contextchars );
    }

    /**
     * @param array $terms Unused here.
     * @return string Always ''.
     */
    function getTitleSnippet( $terms ) {
        return '';
    }

    /**
     * @param array $terms Unused here.
     * @return string Always ''.
     */
    function getRedirectSnippet( $terms ) {
        return '';
    }

    /**
     * @return null Always null in this class.
     */
    function getRedirectTitle() {
        return null;
    }

    /**
     * @return string Always ''.
     */
    function getSectionSnippet() {
        return '';
    }

    /**
     * @return null Always null in this class.
     */
    function getSectionTitle() {
        return null;
    }

    /**
     * Timestamp of the revision if there is one, else of the file, else ''.
     *
     * @return mixed
     */
    function getTimestamp() {
        if ( $this->mRevision ) {
            return $this->mRevision->getTimestamp();
        }
        if ( $this->mImage ) {
            return $this->mImage->getTimestamp();
        }
        return '';
    }

    /**
     * @return int str_word_count() of the page text.
     */
    function getWordCount() {
        $this->initText();
        return str_word_count( $this->mText );
    }

    /**
     * @return int Length of the page text in bytes.
     */
    function getByteSize() {
        $this->initText();
        return strlen( $this->mText );
    }

    /**
     * @return bool Always false in this class.
     */
    function hasRelated() {
        return false;
    }

    /**
     * @return string Always ''.
     */
    function getInterwikiPrefix() {
        return '';
    }
}
/**
 * Result set holding at most one result: a "near match" title.
 */
class SearchNearMatchResultSet extends SearchResultSet {
    /** Whether next() has already handed out the single result. */
    private $fetched = false;

    /**
     * The match given to the constructor (falsy when there is none).
     * Declared explicitly instead of being created dynamically; public
     * because the undeclared property was effectively public before.
     */
    public $result;

    /**
     * @param mixed $match Title to return as the only result, or a falsy
     *   value for "no match".
     */
    public function __construct( $match ) {
        $this->result = $match;
    }

    /**
     * Overrides the parent stub so that a present match is reported.
     * Previously only hasResult() (singular) existed, so hasResults() --
     * and with it numRows() -- always fell through to the parent and
     * claimed the set was empty.
     *
     * @return bool True when a match was supplied.
     */
    public function hasResults() {
        return (bool)$this->result;
    }

    /**
     * Alias of hasResults(), kept for callers of the old, misspelled name.
     *
     * @return bool
     */
    public function hasResult() {
        return $this->hasResults();
    }

    /**
     * @return int 1 when a match was supplied, otherwise 0.
     */
    public function numRows() {
        return $this->hasResults() ? 1 : 0;
    }

    /**
     * Hand out the single result once.
     *
     * @return SearchResult|bool The wrapped match on the first call; false
     *   on later calls or when there is no match.
     */
    public function next() {
        if ( $this->fetched || !$this->result ) {
            return false;
        }
        $this->fetched = true;
        return SearchResult::newFromTitle( $this->result );
    }
}
00997 
01003 class SearchHighlighter {
01004     var $mCleanWikitext = true;
01005 
/**
 * @param bool $cleanupWikitext Stored in $this->mCleanWikitext; when true,
 *   splitAndAdd() passes text through removeWiki() before splitting it.
 */
function __construct( $cleanupWikitext = true ) {
    $this->mCleanWikitext = $cleanupWikitext;
}
01009 
/**
 * Build a highlighted snippet of $text for the given search terms.
 *
 * Outline:
 *  1. Split the text into plain-text lines and "other" lines (templates,
 *     links carrying a file-namespace prefix, tables and, when wfCite()
 *     exists, <ref> blocks), keeping a shared sequence number so the
 *     original order is known.
 *  2. If the first eligible text line contains every term, use it;
 *     otherwise collect matching lines through process() (defined elsewhere
 *     in this class), whole phrase first, then any single term.
 *  3. Pad the chosen lines towards a constant size, join them with
 *     "<b> ... </b>" where they are not contiguous, and wrap each matched
 *     term in <span class='searchmatch'>.
 *
 * The terms are interpolated into regular expressions unescaped, so they
 * are expected to be regex fragments already. This method itself does not
 * HTML-escape the extract (the htmlspecialchars call below is commented
 * out).
 *
 * @param string $text Text to take the snippet from.
 * @param array $terms Regex fragments to look for. May be empty, in which
 *   case the context size is left unscaled instead of dividing by zero.
 * @param int $contextlines Number of snippet lines to aim for.
 * @param int $contextchars Number of characters per line to aim for.
 * @return string The highlighted extract; '' when $text is empty.
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
    global $wgContLang;
    global $wgSearchHighlightBoundaries;
    $fname = __METHOD__;

    if ( $text == '' ) {
        return '';
    }

    // split text into text + templates/links/tables
    $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
    // first capture group is for detecting nested templates/links/tables/references
    $endPatterns = array(
        1 => '/(\{\{)|(\}\})/', // template
        2 => '/(\[\[)|(\]\])/', // image
        3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

    // @todo FIXME: This should prolly be a hook or something
    if ( function_exists( 'wfCite' ) ) {
        $spat .= '|(<ref>)'; // references via cite extension
        $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
    }
    $spat .= '/';
    $textExt = array(); // text extracts
    $otherExt = array(); // other extracts
    wfProfileIn( "$fname-split" );
    $start = 0;
    $textLen = strlen( $text );
    $count = 0; // sequence number to maintain ordering
    while ( $start < $textLen ) {
        // find start of template/image/table
        if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
            $epat = '';
            foreach ( $matches as $key => $val ) {
                // with PREG_OFFSET_CAPTURE an unmatched group has offset -1
                if ( $key > 0 && $val[1] != - 1 ) {
                    if ( $key == 2 ) {
                        // see if this is an image link
                        $ns = substr( $val[0], 2, - 1 );
                        if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
                            break;
                        }

                    }
                    $epat = $endPatterns[$key];
                    $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
                    $start = $val[1];
                    break;
                }
            }
            if ( $epat ) {
                // find end (and detect any nested elements)
                $level = 0;
                $offset = $start + 1;
                $found = false;
                while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
                    if ( array_key_exists( 2, $endMatches ) ) {
                        // found end
                        if ( $level == 0 ) {
                            $len = strlen( $endMatches[2][0] );
                            $off = $endMatches[2][1];
                            $this->splitAndAdd( $otherExt, $count,
                                substr( $text, $start, $off + $len - $start ) );
                            $start = $off + $len;
                            $found = true;
                            break;
                        } else {
                            // end of nested element
                            $level -= 1;
                        }
                    } else {
                        // nested
                        $level += 1;
                    }
                    $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
                }
                if ( ! $found ) {
                    // couldn't find appropriate closing tag, skip
                    $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
                    $start += strlen( $matches[0][0] );
                }
                continue;
            }
        }
        // else: add as text extract
        $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
        break;
    }

    $all = $textExt + $otherExt; // these have disjunct key sets

    wfProfileOut( "$fname-split" );

    // prepare regexps
    foreach ( $terms as $index => $term ) {
        // manually do upper/lowercase stuff for utf-8 since PHP won't do it
        if ( preg_match( '/[\x80-\xff]/', $term ) ) {
            $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
        } else {
            $terms[$index] = $term;
        }
    }
    $anyterm = implode( '|', $terms );
    $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

    // @todo FIXME: A hack to scale contextchars, a correct solution
    // would be to have contextchars actually be char and not byte
    // length, and do proper utf-8 substrings and lengths everywhere,
    // but PHP is making that very hard and unclean to implement :(
    //
    // $anyterm is '' when no terms were given; mb_strlen() is then 0 and
    // the division would fail, so fall back to a neutral scale of 1.
    $anytermChars = mb_strlen( $anyterm );
    $scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
    $contextchars = intval( $contextchars * $scale );

    $patPre = "(^|$wgSearchHighlightBoundaries)";
    $patPost = "($wgSearchHighlightBoundaries|$)";

    $pat1 = "/(" . $phrase . ")/ui";
    $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

    wfProfileIn( "$fname-extract" );

    $left = $contextlines;

    $snippets = array();
    $offsets = array();

    // show beginning only if it contains all words
    $first = 0;
    $firstText = '';
    foreach ( $textExt as $index => $line ) {
        // skip lines starting with ';' or ':'
        if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
            $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
            $first = $index;
            break;
        }
    }
    if ( $firstText ) {
        $succ = true;
        // check if first text contains all terms
        foreach ( $terms as $term ) {
            if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
                $succ = false;
                break;
            }
        }
        if ( $succ ) {
            $snippets[$first] = $firstText;
            $offsets[$first] = 0;
        }
    }
    if ( ! $snippets ) {
        // match whole query on text
        $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
        // match whole query on templates/tables/images
        $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
        // match any words on text
        $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
        // match any words on templates/tables/images
        $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

        ksort( $snippets );
    }

    // add extra chars to each snippet to make snippets constant size
    $extended = array();
    if ( count( $snippets ) == 0 ) {
        // couldn't find the target words, just show beginning of article
        if ( array_key_exists( $first, $all ) ) {
            $targetchars = $contextchars * $contextlines;
            $snippets[$first] = '';
            $offsets[$first] = 0;
        }
    } else {
        // if begin of the article contains the whole phrase, show only that !!
        if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
            && $offsets[$first] < $contextchars * 2 ) {
            $snippets = array( $first => $snippets[$first] );
        }

        // calc by how much to extend existing snippets
        $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
    }

    foreach ( $snippets as $index => $line ) {
        $extended[$index] = $line;
        $len = strlen( $line );
        if ( $len < $targetchars - 20 ) {
            // complete this line
            if ( $len < strlen( $all[$index] ) ) {
                $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
                $len = strlen( $extended[$index] );
            }

            // add more lines
            $add = $index + 1;
            while ( $len < $targetchars - 20
                    && array_key_exists( $add, $all )
                    && !array_key_exists( $add, $snippets ) ) {
                $offsets[$add] = 0;
                $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
                $extended[$add] = $tt;
                $len += strlen( $tt );
                $add++;
            }
        }
    }

    // $snippets = array_map( 'htmlspecialchars', $extended );
    $snippets = $extended;
    $last = - 1;
    $extract = '';
    foreach ( $snippets as $index => $line ) {
        if ( $last == - 1 ) {
            $extract .= $line; // first line
        } elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
            $extract .= " " . $line; // continuous lines
        } else {
            $extract .= '<b> ... </b>' . $line;
        }

        $last = $index;
    }
    if ( $extract ) {
        $extract .= '<b> ... </b>';
    }

    // $processed keeps a repeated term from being wrapped twice
    $processed = array();
    foreach ( $terms as $term ) {
        if ( ! isset( $processed[$term] ) ) {
            $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
            $extract = preg_replace( $pat3,
                "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
            $processed[$term] = true;
        }
    }

    wfProfileOut( "$fname-extract" );

    return $extract;
}
01257 
01265     function splitAndAdd( &$extracts, &$count, $text ) {
01266         $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
01267         foreach ( $split as $line ) {
01268             $tt = trim( $line );
01269             if ( $tt ) {
01270                 $extracts[$count++] = $tt;
01271             }
01272         }
01273     }
01274 
01281     function caseCallback( $matches ) {
01282         global $wgContLang;
01283         if ( strlen( $matches[0] ) > 1 ) {
01284             return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
01285         } else {
01286             return $matches[0];
01287         }
01288     }
01289 
01300     function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
01301         if ( $start != 0 ) {
01302             $start = $this->position( $text, $start, 1 );
01303         }
01304         if ( $end >= strlen( $text ) ) {
01305             $end = strlen( $text );
01306         } else {
01307             $end = $this->position( $text, $end );
01308         }
01309 
01310         if ( !is_null( $posStart ) ) {
01311             $posStart = $start;
01312         }
01313         if ( !is_null( $posEnd ) ) {
01314             $posEnd = $end;
01315         }
01316 
01317         if ( $end > $start ) {
01318             return substr( $text, $start, $end - $start );
01319         } else {
01320             return '';
01321         }
01322     }
01323 
01332     function position( $text, $point, $offset = 0 ) {
01333         $tolerance = 10;
01334         $s = max( 0, $point - $tolerance );
01335         $l = min( strlen( $text ), $point + $tolerance ) - $s;
01336         $m = array();
01337         if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
01338             return $m[0][1] + $s + $offset;
01339         } else {
01340             // check if point is on a valid first UTF8 char
01341             $char = ord( $text[$point] );
01342             while ( $char >= 0x80 && $char < 0xc0 ) {
01343                 // skip trailing bytes
01344                 $point++;
01345                 if ( $point >= strlen( $text ) ) {
01346                     return strlen( $text );
01347                 }
01348                 $char = ord( $text[$point] );
01349             }
01350             return $point;
01351 
01352         }
01353     }
01354 
01366     function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
01367         if ( $linesleft == 0 ) {
01368             return; // nothing to do
01369         }
01370         foreach ( $extracts as $index => $line ) {
01371             if ( array_key_exists( $index, $out ) ) {
01372                 continue; // this line already highlighted
01373             }
01374 
01375             $m = array();
01376             if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
01377                 continue;
01378             }
01379 
01380             $offset = $m[0][1];
01381             $len = strlen( $m[0][0] );
01382             if ( $offset + $len < $contextchars ) {
01383                 $begin = 0;
01384             } elseif ( $len > $contextchars ) {
01385                 $begin = $offset;
01386             } else {
01387                 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
01388             }
01389 
01390             $end = $begin + $contextchars;
01391 
01392             $posBegin = $begin;
01393             // basic snippet from this line
01394             $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
01395             $offsets[$index] = $posBegin;
01396             $linesleft--;
01397             if ( $linesleft == 0 ) {
01398                 return;
01399             }
01400         }
01401     }
01402 
01408     function removeWiki( $text ) {
01409         $fname = __METHOD__;
01410         wfProfileIn( $fname );
01411 
01412         // $text = preg_replace( "/'{2,5}/", "", $text );
01413         // $text = preg_replace( "/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text );
01414         // $text = preg_replace( "/\[\[([^]|]+)\]\]/", "\\1", $text );
01415         // $text = preg_replace( "/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text );
01416         // $text = preg_replace( "/\\{\\|(.*?)\\|\\}/", "", $text );
01417         // $text = preg_replace( "/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text );
01418         $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
01419         $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
01420         $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
01421         $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
01422         // $text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
01423         $text = preg_replace( "/<\/?[^>]+>/", "", $text );
01424         $text = preg_replace( "/'''''/", "", $text );
01425         $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
01426         $text = preg_replace( "/''/", "", $text );
01427 
01428         wfProfileOut( $fname );
01429         return $text;
01430     }
01431 
01438     function linkReplace( $matches ) {
01439         $colon = strpos( $matches[1], ':' );
01440         if ( $colon === false ) {
01441             return $matches[2]; // replace with caption
01442         }
01443         global $wgContLang;
01444         $ns = substr( $matches[1], 0, $colon );
01445         $index = $wgContLang->getNsIndex( $ns );
01446         if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
01447             return $matches[0]; // return the whole thing
01448         } else {
01449             return $matches[2];
01450         }
01451     }
01452 
01463     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
01464         global $wgContLang;
01465         $fname = __METHOD__;
01466 
01467         $lines = explode( "\n", $text );
01468 
01469         $terms = implode( '|', $terms );
01470         $max = intval( $contextchars ) + 1;
01471         $pat1 = "/(.*)($terms)(.{0,$max})/i";
01472 
01473         $lineno = 0;
01474 
01475         $extract = "";
01476         wfProfileIn( "$fname-extract" );
01477         foreach ( $lines as $line ) {
01478             if ( 0 == $contextlines ) {
01479                 break;
01480             }
01481             ++$lineno;
01482             $m = array();
01483             if ( ! preg_match( $pat1, $line, $m ) ) {
01484                 continue;
01485             }
01486             --$contextlines;
01487             // truncate function changes ... to relevant i18n message.
01488             $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
01489 
01490             if ( count( $m ) < 3 ) {
01491                 $post = '';
01492             } else {
01493                 $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
01494             }
01495 
01496             $found = $m[2];
01497 
01498             $line = htmlspecialchars( $pre . $found . $post );
01499             $pat2 = '/(' . $terms . ")/i";
01500             $line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );
01501 
01502             $extract .= "${line}\n";
01503         }
01504         wfProfileOut( "$fname-extract" );
01505 
01506         return $extract;
01507     }
01508 
01509 }
01510 
01517 class SearchEngineDummy extends SearchEngine {
01518     // no-op: declares no members of its own, so every method is the one inherited from SearchEngine
01519 }