MediaWiki  REL1_23
SearchHighlighter.php
Go to the documentation of this file.
00001 <?php
/**
 * Builds short, highlighted text snippets ("extracts") for search results.
 *
 * Two strategies are offered:
 *  - highlightText(): splits wikitext into plain-text lines and
 *    template/image-link/table blocks, picks the lines that match the search
 *    terms and pads them to a roughly constant size;
 *  - highlightSimple(): a plain line-by-line matcher.
 *
 * In both methods $terms are interpolated into regular expressions verbatim,
 * i.e. they are treated as regex fragments, not as literal strings.
 */
class SearchHighlighter {
    /** @var bool Whether splitAndAdd() passes text through removeWiki() first */
    public $mCleanWikitext = true;

    /**
     * @param bool $cleanupWikitext If true, wiki markup is stripped from the
     *   text before it is split into lines (see removeWiki())
     */
    function __construct( $cleanupWikitext = true ) {
        $this->mCleanWikitext = $cleanupWikitext;
    }

    /**
     * Build a highlighted extract of $text around the given search terms.
     *
     * NOTE(review): the returned string contains <b> and <span> markup that is
     * generated here, but the page text itself is not passed through
     * htmlspecialchars() in this method - confirm that callers expect that.
     *
     * @param string $text Wikitext to extract from
     * @param array $terms Search terms, used as regex fragments
     * @param int $contextlines Number of snippet lines wanted
     * @param int $contextchars Characters of context per snippet line
     * @return string The extract, or '' when $text is empty
     */
    public function highlightText( $text, $terms, $contextlines, $contextchars ) {
        global $wgContLang;
        global $wgSearchHighlightBoundaries;
        $fname = __METHOD__;

        if ( $text == '' ) {
            return '';
        }

        // split text into text + templates/links/tables
        $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
        // first capture group is for detecting nested templates/links/tables/references
        $endPatterns = array(
            1 => '/(\{\{)|(\}\})/', // template
            2 => '/(\[\[)|(\]\])/', // image
            3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

        // @todo FIXME: This should prolly be a hook or something
        if ( function_exists( 'wfCite' ) ) {
            $spat .= '|(<ref>)'; // references via cite extension
            $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
        }
        $spat .= '/';
        $textExt = array(); // text extracts
        $otherExt = array(); // other extracts
        wfProfileIn( "$fname-split" );
        $start = 0;
        $textLen = strlen( $text );
        $count = 0; // sequence number to maintain ordering
        while ( $start < $textLen ) {
            // find start of template/image/table
            if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
                $epat = '';
                foreach ( $matches as $key => $val ) {
                    // with PREG_OFFSET_CAPTURE, unmatched groups carry offset -1
                    if ( $key > 0 && $val[1] != - 1 ) {
                        if ( $key == 2 ) {
                            // see if this is an image link
                            $ns = substr( $val[0], 2, - 1 );
                            if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
                                // Not a file link: $epat stays empty, so the rest
                                // of the text is added below as plain text
                                break;
                            }

                        }
                        $epat = $endPatterns[$key];
                        // everything before the opening token is plain text
                        $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
                        $start = $val[1];
                        break;
                    }
                }
                if ( $epat ) {
                    // find end (and detect any nested elements)
                    $level = 0;
                    $offset = $start + 1;
                    $found = false;
                    while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
                        // group 2 is the closing token; it is only present in
                        // the result when it took part in the match
                        if ( array_key_exists( 2, $endMatches ) ) {
                            // found end
                            if ( $level == 0 ) {
                                $len = strlen( $endMatches[2][0] );
                                $off = $endMatches[2][1];
                                $this->splitAndAdd( $otherExt, $count,
                                    substr( $text, $start, $off + $len - $start ) );
                                $start = $off + $len;
                                $found = true;
                                break;
                            } else {
                                // end of nested element
                                $level -= 1;
                            }
                        } else {
                            // nested
                            $level += 1;
                        }
                        $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
                    }
                    if ( ! $found ) {
                        // couldn't find appropriate closing tag, skip:
                        // keep the opening token as text and move past it
                        $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
                        $start += strlen( $matches[0][0] );
                    }
                    continue;
                }
            }
            // else: add as text extract
            $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
            break;
        }

        $all = $textExt + $otherExt; // these have disjunct key sets

        wfProfileOut( "$fname-split" );

        // prepare regexps
        foreach ( $terms as $index => $term ) {
            // manually do upper/lowercase stuff for utf-8 since PHP won't do it
            if ( preg_match( '/[\x80-\xff]/', $term ) ) {
                $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
            } else {
                $terms[$index] = $term;
            }
        }
        $anyterm = implode( '|', $terms );
        $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

        // @todo FIXME: A hack to scale contextchars, a correct solution
        // would be to have contextchars actually be char and not byte
        // length, and do proper utf-8 substrings and lengths everywhere,
        // but PHP is making that very hard and unclean to implement :(
        // With no terms $anyterm is empty; avoid dividing by zero and leave
        // the requested context size unscaled.
        $termChars = mb_strlen( $anyterm );
        $scale = $termChars > 0 ? strlen( $anyterm ) / $termChars : 1;
        $contextchars = intval( $contextchars * $scale );

        $patPre = "(^|$wgSearchHighlightBoundaries)";
        $patPost = "($wgSearchHighlightBoundaries|$)";

        $pat1 = "/(" . $phrase . ")/ui";
        $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

        wfProfileIn( "$fname-extract" );

        $left = $contextlines;

        $snippets = array();
        $offsets = array();

        // show beginning only if it contains all words
        $first = 0;
        $firstText = '';
        foreach ( $textExt as $index => $line ) {
            // skip leading lines that start with ';' or ':'
            if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
                $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
                $first = $index;
                break;
            }
        }
        if ( $firstText ) {
            $succ = true;
            // check if first text contains all terms
            foreach ( $terms as $term ) {
                if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
                    $succ = false;
                    break;
                }
            }
            if ( $succ ) {
                $snippets[$first] = $firstText;
                $offsets[$first] = 0;
            }
        }
        if ( ! $snippets ) {
            // match whole query on text
            $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
            // match whole query on templates/tables/images
            $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
            // match any words on text
            $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
            // match any words on templates/tables/images
            $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

            ksort( $snippets );
        }

        // add extra chars to each snippet to make snippets constant size
        $extended = array();
        $targetchars = 0; // only read inside the loop below, which needs a snippet
        if ( count( $snippets ) == 0 ) {
            // couldn't find the target words, just show beginning of article
            if ( array_key_exists( $first, $all ) ) {
                $targetchars = $contextchars * $contextlines;
                $snippets[$first] = '';
                $offsets[$first] = 0;
            }
        } else {
            // if begin of the article contains the whole phrase, show only that !!
            if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
                && $offsets[$first] < $contextchars * 2 ) {
                $snippets = array( $first => $snippets[$first] );
            }

            // calc by how much to extend existing snippets
            $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
        }

        foreach ( $snippets as $index => $line ) {
            $extended[$index] = $line;
            $len = strlen( $line );
            if ( $len < $targetchars - 20 ) {
                // complete this line
                if ( $len < strlen( $all[$index] ) ) {
                    $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
                    $len = strlen( $extended[$index] );
                }

                // add more lines
                $add = $index + 1;
                while ( $len < $targetchars - 20
                        && array_key_exists( $add, $all )
                        && !array_key_exists( $add, $snippets ) ) {
                    $offsets[$add] = 0;
                    $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
                    $extended[$add] = $tt;
                    $len += strlen( $tt );
                    $add++;
                }
            }
        }

        // join the snippets; gaps between them are marked with an ellipsis
        $snippets = $extended;
        $last = - 1;
        $extract = '';
        foreach ( $snippets as $index => $line ) {
            if ( $last == - 1 ) {
                $extract .= $line; // first line
            } elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
                $extract .= " " . $line; // continuous lines
            } else {
                $extract .= '<b> ... </b>' . $line;
            }

            $last = $index;
        }
        if ( $extract ) {
            $extract .= '<b> ... </b>';
        }

        // wrap every distinct term in the highlight span
        $processed = array();
        foreach ( $terms as $term ) {
            if ( ! isset( $processed[$term] ) ) {
                $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
                $extract = preg_replace( $pat3,
                    "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
                $processed[$term] = true;
            }
        }

        wfProfileOut( "$fname-extract" );

        return $extract;
    }

    /**
     * Split $text into lines and append every non-empty, trimmed line to
     * $extracts, keyed by the running sequence number $count.
     *
     * When mCleanWikitext is set the text is run through removeWiki() first.
     *
     * @param array &$extracts Receives the lines
     * @param int &$count Next key to use; incremented once per stored line
     * @param string $text
     */
    function splitAndAdd( &$extracts, &$count, $text ) {
        $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
        foreach ( $split as $line ) {
            $tt = trim( $line );
            // compare against '' explicitly: a line consisting of "0" is
            // falsy in PHP but is real content and must be kept
            if ( $tt !== '' ) {
                $extracts[$count++] = $tt;
            }
        }
    }

    /**
     * preg_replace_callback() helper: turn a multi-byte character into a
     * character class holding its lower- and upper-case forms. Single-byte
     * characters are returned unchanged.
     *
     * @param array $matches $matches[0] is the matched character
     * @return string
     */
    function caseCallback( $matches ) {
        global $wgContLang;
        if ( strlen( $matches[0] ) > 1 ) {
            return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
        } else {
            return $matches[0];
        }
    }

    /**
     * Cut the part of $text between two byte offsets, after moving both
     * offsets with position() (a non-zero $start, and an $end that lies
     * inside the text).
     *
     * @param string $text
     * @param int $start Byte offset to start near
     * @param int $end Byte offset to end near; clamped to the text length
     * @param int|null &$posStart If a non-null variable is passed, receives the
     *   start offset actually used
     * @param int|null &$posEnd If a non-null variable is passed, receives the
     *   end offset actually used
     * @return string The extracted text, '' if the adjusted range is empty
     */
    function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
        if ( $start != 0 ) {
            // + 1 so the snippet starts after the boundary character
            $start = $this->position( $text, $start, 1 );
        }
        if ( $end >= strlen( $text ) ) {
            $end = strlen( $text );
        } else {
            $end = $this->position( $text, $end );
        }

        if ( !is_null( $posStart ) ) {
            $posStart = $start;
        }
        if ( !is_null( $posEnd ) ) {
            $posEnd = $end;
        }

        if ( $end > $start ) {
            return substr( $text, $start, $end - $start );
        } else {
            return '';
        }
    }

    /**
     * Find a suitable cut position near $point.
     *
     * Looks for the first space/punctuation character in the window of
     * 10 bytes either side of $point. If there is none, $point is moved
     * forward past any UTF-8 continuation bytes so that a multi-byte
     * character is not cut in half.
     *
     * @param string $text
     * @param int $point Byte offset to search around
     * @param int $offset Added to the result when a boundary character is found
     * @return int Byte offset
     */
    function position( $text, $point, $offset = 0 ) {
        $tolerance = 10;
        $s = max( 0, $point - $tolerance );
        $l = min( strlen( $text ), $point + $tolerance ) - $s;
        $m = array();
        if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
            return $m[0][1] + $s + $offset;
        } else {
            $length = strlen( $text );
            // nothing to inspect at or beyond the end of the string; reading
            // $text[$point] there would raise an uninitialized-offset notice
            if ( $point >= $length ) {
                return $length;
            }
            // check if point is on a valid first UTF8 char
            $char = ord( $text[$point] );
            while ( $char >= 0x80 && $char < 0xc0 ) {
                // skip trailing bytes
                $point++;
                if ( $point >= $length ) {
                    return $length;
                }
                $char = ord( $text[$point] );
            }
            return $point;

        }
    }

    /**
     * Search $extracts for lines matching $pattern and store a basic snippet
     * for each in $out, until $linesleft reaches zero. Lines that already
     * have an entry in $out are skipped.
     *
     * @param string $pattern Regular expression
     * @param array $extracts Lines to search, keyed by sequence number
     * @param int &$linesleft Snippets still wanted; decremented per snippet
     * @param int &$contextchars Length of a snippet (not modified here)
     * @param array &$out Receives snippets, keyed like $extracts
     * @param array &$offsets Receives the start offset of each snippet in its line
     */
    function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
        if ( $linesleft == 0 ) {
            return; // nothing to do
        }
        foreach ( $extracts as $index => $line ) {
            if ( array_key_exists( $index, $out ) ) {
                continue; // this line already highlighted
            }

            $m = array();
            if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
                continue;
            }

            $offset = $m[0][1];
            $len = strlen( $m[0][0] );
            if ( $offset + $len < $contextchars ) {
                // match ends within the first $contextchars: start at the line start
                $begin = 0;
            } elseif ( $len > $contextchars ) {
                // match is longer than the snippet: start at the match
                $begin = $offset;
            } else {
                // centre the match inside the snippet
                $begin = $offset + intval( ( $len - $contextchars ) / 2 );
            }

            $end = $begin + $contextchars;

            $posBegin = $begin;
            // basic snippet from this line
            $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
            $offsets[$index] = $posBegin;
            $linesleft--;
            if ( $linesleft == 0 ) {
                return;
            }
        }
    }

    /**
     * Basic wikitext removal: strips templates, link brackets, tags and
     * quote-based emphasis markup with a series of regular expressions.
     *
     * @param string $text
     * @return string
     */
    function removeWiki( $text ) {
        $fname = __METHOD__;
        wfProfileIn( $fname );

        // {{template}} without parameters: drop entirely
        $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
        // {{template|...}}: keep what follows the first pipe
        $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
        // [[link]] without caption: keep the target text
        $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
        // [[link|caption]]: decided per link by linkReplace()
        $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
        // any remaining tags
        $text = preg_replace( "/<\/?[^>]+>/", "", $text );
        // bold/italic quote markup
        $text = preg_replace( "/'''''/", "", $text );
        $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
        $text = preg_replace( "/''/", "", $text );

        wfProfileOut( $fname );
        return $text;
    }

    /**
     * preg_replace_callback() helper for removeWiki(): file and category
     * links are kept whole, every other piped link is replaced by its caption.
     *
     * @param array $matches [0] whole link, [1] target including the pipe,
     *   [2] caption
     * @return string
     */
    function linkReplace( $matches ) {
        $colon = strpos( $matches[1], ':' );
        if ( $colon === false ) {
            return $matches[2]; // replace with caption
        }
        global $wgContLang;
        $ns = substr( $matches[1], 0, $colon );
        $index = $wgContLang->getNsIndex( $ns );
        if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
            return $matches[0]; // return the whole thing
        } else {
            return $matches[2];
        }
    }

    /**
     * Simple and fast line-based highlighter.
     *
     * For at most $contextlines matching lines, the last occurrence of a term
     * on the line is returned with up to $contextchars of context either side,
     * HTML-escaped, and with every term occurrence wrapped in a
     * <span class='searchmatch'>.
     *
     * @param string $text
     * @param array $terms Search terms, used as regex fragments
     * @param int $contextlines Maximum number of lines to return
     * @param int $contextchars Characters of context either side of the match
     * @return string One extract per line, each terminated by "\n"
     */
    public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
        global $wgContLang;
        $fname = __METHOD__;

        $lines = explode( "\n", $text );

        $terms = implode( '|', $terms );
        $max = intval( $contextchars ) + 1;
        $pat1 = "/(.*)($terms)(.{0,$max})/i";
        $pat2 = '/(' . $terms . ")/i";

        $extract = "";
        wfProfileIn( "$fname-extract" );
        foreach ( $lines as $line ) {
            if ( 0 == $contextlines ) {
                break;
            }
            $m = array();
            if ( ! preg_match( $pat1, $line, $m ) ) {
                continue;
            }
            --$contextlines;
            // truncate function changes ... to relevant i18n message.
            $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
            // group 3 may be empty but always takes part in a successful match
            $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
            $found = $m[2];

            // Find the terms in the raw text and escape each piece on its own.
            // Escaping first and matching afterwards would let a term such as
            // "amp" match inside the entity "&amp;" and break the markup.
            $raw = $pre . $found . $post;
            $html = '';
            $cursor = 0;
            $hits = array();
            if ( preg_match_all( $pat2, $raw, $hits, PREG_OFFSET_CAPTURE ) ) {
                foreach ( $hits[0] as $hit ) {
                    // $hit is array( matched text, byte offset )
                    $html .= htmlspecialchars( substr( $raw, $cursor, $hit[1] - $cursor ) )
                        . "<span class='searchmatch'>" . htmlspecialchars( $hit[0] ) . '</span>';
                    $cursor = $hit[1] + strlen( $hit[0] );
                }
            }
            $html .= htmlspecialchars( substr( $raw, $cursor ) );

            $extract .= $html . "\n";
        }
        wfProfileOut( "$fname-extract" );

        return $extract;
    }
}