MediaWiki  REL1_24
SearchHighlighter.php
Go to the documentation of this file.
00001 <?php
/**
 * Builds short, highlighted excerpts of page text for search result listings.
 *
 * Three strategies are provided:
 *  - highlightText():   wikitext-aware; separates templates/images/tables from
 *                       running text and prefers snippets from the latter.
 *  - highlightSimple(): line based; shows every line that matches a term.
 *  - highlightNone():   no term matching; returns the first lines of the text.
 *
 * Offsets and lengths throughout this class are byte counts, not character
 * counts (strlen()/substr() are used everywhere).
 */
class SearchHighlighter {
    /** @var bool Whether splitAndAdd() runs text through removeWiki() first */
    protected $mCleanWikitext = true;

    /**
     * @param bool $cleanupWikitext If true, wikitext markup is stripped from
     *   extracts before they are stored (see removeWiki()).
     */
    public function __construct( $cleanupWikitext = true ) {
        $this->mCleanWikitext = $cleanupWikitext;
    }

    /**
     * Wikitext-aware highlighting.
     *
     * The text is first split into "text" extracts and "other" extracts
     * (templates, file links, tables and, if wfCite() exists, <ref> blocks).
     * Snippets are then picked in this order: whole phrase in text, whole
     * phrase in other, any term in text, any term in other.
     *
     * Note: the returned string is NOT passed through htmlspecialchars() here.
     *
     * @param string $text Wikitext to extract snippets from
     * @param array $terms Regex fragments, one per search term; they are
     *   interpolated into patterns as-is (no preg_quote() is applied)
     * @param int $contextlines Maximum number of snippets (lines) to pick
     * @param int $contextchars Approximate size of one snippet
     * @return string Snippets joined with '<b> ... </b>', with matches wrapped
     *   in <span class='searchmatch'>; '' if $text is empty
     */
    public function highlightText( $text, $terms, $contextlines, $contextchars ) {
        global $wgContLang, $wgSearchHighlightBoundaries;

        $fname = __METHOD__;

        if ( $text == '' ) {
            return '';
        }

        // split text into text + templates/links/tables
        $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
        // first capture group is for detecting nested templates/links/tables/references
        $endPatterns = array(
            1 => '/(\{\{)|(\}\})/', // template
            2 => '/(\[\[)|(\]\])/', // image
            3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

        // @todo FIXME: This should prolly be a hook or something
        if ( function_exists( 'wfCite' ) ) {
            $spat .= '|(<ref>)'; // references via cite extension
            $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
        }
        $spat .= '/';
        $textExt = array(); // text extracts
        $otherExt = array(); // other extracts
        wfProfileIn( "$fname-split" );
        $start = 0;
        $textLen = strlen( $text );
        $count = 0; // sequence number to maintain ordering
        while ( $start < $textLen ) {
            // find start of template/image/table
            if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
                $epat = '';
                foreach ( $matches as $key => $val ) {
                    // with PREG_OFFSET_CAPTURE, groups that did not take part
                    // in the match report offset -1
                    if ( $key > 0 && $val[1] != - 1 ) {
                        if ( $key == 2 ) {
                            // see if this is an image link
                            $ns = substr( $val[0], 2, - 1 );
                            if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
                                // not a file link: $epat stays empty, so the
                                // remainder of the text is added as plain text below
                                break;
                            }

                        }
                        $epat = $endPatterns[$key];
                        $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
                        $start = $val[1];
                        break;
                    }
                }
                if ( $epat ) {
                    // find end (and detect any nested elements)
                    $level = 0;
                    $offset = $start + 1;
                    $found = false;
                    while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
                        // group 2 is the closing token; it is only present in
                        // the match array when it was the alternative that matched
                        if ( array_key_exists( 2, $endMatches ) ) {
                            // found end
                            if ( $level == 0 ) {
                                $len = strlen( $endMatches[2][0] );
                                $off = $endMatches[2][1];
                                $this->splitAndAdd( $otherExt, $count,
                                    substr( $text, $start, $off + $len - $start ) );
                                $start = $off + $len;
                                $found = true;
                                break;
                            } else {
                                // end of nested element
                                $level -= 1;
                            }
                        } else {
                            // nested
                            $level += 1;
                        }
                        $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
                    }
                    if ( !$found ) {
                        // couldn't find appropriate closing tag, skip
                        $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
                        $start += strlen( $matches[0][0] );
                    }
                    continue;
                }
            }
            // else: add as text extract
            $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
            break;
        }

        $all = $textExt + $otherExt; // these have disjunct key sets

        wfProfileOut( "$fname-split" );

        // prepare regexps
        foreach ( $terms as $index => $term ) {
            // manually do upper/lowercase stuff for utf-8 since PHP won't do it
            if ( preg_match( '/[\x80-\xff]/', $term ) ) {
                $terms[$index] = preg_replace_callback(
                    '/./us',
                    array( $this, 'caseCallback' ),
                    $terms[$index]
                );
            }
        }
        $anyterm = implode( '|', $terms );
        $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

        // @todo FIXME: A hack to scale contextchars, a correct solution
        // would be to have contextchars actually be char and not byte
        // length, and do proper utf-8 substrings and lengths everywhere,
        // but PHP is making that very hard and unclean to implement :(
        // Guard against an empty term list, which would divide by zero.
        $anytermChars = mb_strlen( $anyterm );
        $scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
        $contextchars = intval( $contextchars * $scale );

        $patPre = "(^|$wgSearchHighlightBoundaries)";
        $patPost = "($wgSearchHighlightBoundaries|$)";

        $pat1 = "/(" . $phrase . ")/ui";
        $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

        wfProfileIn( "$fname-extract" );

        $left = $contextlines;

        $snippets = array();
        $offsets = array();

        // show beginning only if it contains all words
        $first = 0;
        $firstText = '';
        foreach ( $textExt as $index => $line ) {
            // skip lines starting with ';' or ':'
            if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
                $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
                $first = $index;
                break;
            }
        }
        // explicit comparison: a first line of "0" is valid text
        if ( $firstText !== '' ) {
            $succ = true;
            // check if first text contains all terms
            foreach ( $terms as $term ) {
                if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
                    $succ = false;
                    break;
                }
            }
            if ( $succ ) {
                $snippets[$first] = $firstText;
                $offsets[$first] = 0;
            }
        }
        if ( !$snippets ) {
            // match whole query on text
            $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
            // match whole query on templates/tables/images
            $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
            // match any words on text
            $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
            // match any words on templates/tables/images
            $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

            ksort( $snippets );
        }

        // add extra chars to each snippet to make snippets constant size
        $extended = array();
        $targetchars = 0;
        if ( count( $snippets ) == 0 ) {
            // couldn't find the target words, just show beginning of article
            if ( array_key_exists( $first, $all ) ) {
                $targetchars = $contextchars * $contextlines;
                $snippets[$first] = '';
                $offsets[$first] = 0;
            }
        } else {
            // if begin of the article contains the whole phrase, show only that !!
            if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
                && $offsets[$first] < $contextchars * 2 ) {
                $snippets = array( $first => $snippets[$first] );
            }

            // calc by how much to extend existing snippets
            $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
        }

        foreach ( $snippets as $index => $line ) {
            $extended[$index] = $line;
            $len = strlen( $line );
            if ( $len < $targetchars - 20 ) {
                // complete this line
                if ( $len < strlen( $all[$index] ) ) {
                    $extended[$index] = $this->extract(
                        $all[$index],
                        $offsets[$index],
                        $offsets[$index] + $targetchars,
                        $offsets[$index]
                    );
                    $len = strlen( $extended[$index] );
                }

                // add more lines
                $add = $index + 1;
                while ( $len < $targetchars - 20
                        && array_key_exists( $add, $all )
                        && !array_key_exists( $add, $snippets ) ) {
                    $offsets[$add] = 0;
                    $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
                    $extended[$add] = $tt;
                    $len += strlen( $tt );
                    $add++;
                }
            }
        }

        // join the snippets; no HTML escaping is applied at this point
        $snippets = $extended;
        $last = - 1;
        $extract = '';
        foreach ( $snippets as $index => $line ) {
            if ( $last == - 1 ) {
                $extract .= $line; // first line
            } elseif ( $last + 1 == $index
                && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
            ) {
                $extract .= " " . $line; // continuous lines
            } else {
                $extract .= '<b> ... </b>' . $line;
            }

            $last = $index;
        }
        // explicit comparison: an extract of "0" still gets the trailing marker
        if ( $extract !== '' ) {
            $extract .= '<b> ... </b>';
        }

        $processed = array();
        foreach ( $terms as $term ) {
            if ( !isset( $processed[$term] ) ) {
                $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
                $extract = preg_replace( $pat3,
                    "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
                $processed[$term] = true;
            }
        }

        wfProfileOut( "$fname-extract" );

        return $extract;
    }

    /**
     * Split text into lines and append each non-empty, trimmed line to an
     * extract list.
     *
     * If $this->mCleanWikitext is set, the text is run through removeWiki()
     * before being split.
     *
     * @param array &$extracts List to append to; keys are taken from $count
     * @param int &$count Running sequence number, incremented per added line
     * @param string $text Text to split on "\n"
     */
    public function splitAndAdd( &$extracts, &$count, $text ) {
        $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
        foreach ( $split as $line ) {
            $tt = trim( $line );
            // compare against '' explicitly: a line consisting of "0" is
            // falsy in PHP but is real content and must be kept
            if ( $tt !== '' ) {
                $extracts[$count++] = $tt;
            }
        }
    }

    /**
     * preg_replace_callback() helper that turns a multi-byte character into a
     * character class containing its lower- and upper-case forms.
     *
     * Single-byte matches are returned unchanged.
     *
     * @param array $matches Match array; only $matches[0] is used
     * @return string
     */
    public function caseCallback( $matches ) {
        global $wgContLang;
        if ( strlen( $matches[0] ) > 1 ) {
            return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
        } else {
            return $matches[0];
        }
    }

    /**
     * Cut a piece out of $text, adjusting both ends with position().
     *
     * @param string $text
     * @param int $start Requested start byte offset; adjusted unless it is 0
     * @param int $end Requested end byte offset; clamped to the text length
     * @param int|null &$posStart Receives the adjusted start, but only if the
     *   caller passed a non-null value
     * @param int|null &$posEnd Receives the adjusted end, but only if the
     *   caller passed a non-null value
     * @return string The piece, or '' if the adjusted end is not after the
     *   adjusted start
     */
    public function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
        if ( $start != 0 ) {
            // offset 1: begin just after the boundary character that was found
            $start = $this->position( $text, $start, 1 );
        }
        if ( $end >= strlen( $text ) ) {
            $end = strlen( $text );
        } else {
            $end = $this->position( $text, $end );
        }

        if ( !is_null( $posStart ) ) {
            $posStart = $start;
        }
        if ( !is_null( $posEnd ) ) {
            $posEnd = $end;
        }

        if ( $end > $start ) {
            return substr( $text, $start, $end - $start );
        } else {
            return '';
        }
    }

    /**
     * Find a cut position near $point.
     *
     * A window of up to 10 bytes on either side of $point is searched for the
     * first space/punctuation character. If one is found, its offset (plus
     * $offset) is returned. Otherwise $point is moved forward past any UTF-8
     * continuation bytes so that a multi-byte character is not split.
     *
     * @param string $text
     * @param int $point Byte offset to search around
     * @param int $offset Added to the result when a boundary character is found
     * @return int Byte offset into $text
     */
    public function position( $text, $point, $offset = 0 ) {
        $tolerance = 10;
        $s = max( 0, $point - $tolerance );
        $l = min( strlen( $text ), $point + $tolerance ) - $s;
        $m = array();

        if ( preg_match(
            '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
            substr( $text, $s, $l ),
            $m,
            PREG_OFFSET_CAPTURE
        ) ) {
            return $m[0][1] + $s + $offset;
        } else {
            // keep $point inside the string; indexing past either end would
            // read an uninitialized string offset
            $textLen = strlen( $text );
            if ( $point >= $textLen ) {
                return $textLen;
            }
            if ( $point < 0 ) {
                $point = 0;
            }

            // check if point is on a valid first UTF8 char
            $char = ord( $text[$point] );
            while ( $char >= 0x80 && $char < 0xc0 ) {
                // skip trailing bytes
                $point++;
                if ( $point >= $textLen ) {
                    return $textLen;
                }
                $char = ord( $text[$point] );
            }

            return $point;
        }
    }

    /**
     * Pick snippets from the lines of $extracts that match $pattern.
     *
     * Lines whose index is already present in $out are skipped. For each
     * remaining line that matches, a window of $contextchars bytes around the
     * first match is extracted and stored.
     *
     * @param string $pattern Regular expression to match lines against
     * @param array $extracts Lines to search, keyed by sequence number
     * @param int &$linesleft Number of snippets still allowed; decremented per
     *   snippet, processing stops when it reaches 0
     * @param int &$contextchars Window size in bytes (read only here)
     * @param array &$out Receives snippets, keyed like $extracts
     * @param array &$offsets Receives the start offset of each snippet within
     *   its line, keyed like $extracts
     */
    public function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
        if ( $linesleft == 0 ) {
            return; // nothing to do
        }
        foreach ( $extracts as $index => $line ) {
            if ( array_key_exists( $index, $out ) ) {
                continue; // this line already highlighted
            }

            $m = array();
            if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
                continue;
            }

            $offset = $m[0][1];
            $len = strlen( $m[0][0] );
            if ( $offset + $len < $contextchars ) {
                // match ends inside the first window: start at line start
                $begin = 0;
            } elseif ( $len > $contextchars ) {
                // match is longer than the window: start at the match
                $begin = $offset;
            } else {
                // centre the window on the match
                $begin = $offset + intval( ( $len - $contextchars ) / 2 );
            }

            $end = $begin + $contextchars;

            $posBegin = $begin;
            // basic snippet from this line
            $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
            $offsets[$index] = $posBegin;
            $linesleft--;
            if ( $linesleft == 0 ) {
                return;
            }
        }
    }

    /**
     * Strip a subset of wikitext and HTML markup from text.
     *
     * In order: templates without a pipe are removed; templates with a pipe
     * are replaced by what follows the first pipe; unpiped [[links]] are
     * replaced by their content; piped links go through linkReplace(); tags
     * of the form <...> are removed; bold/italic quote markup is removed.
     *
     * @param string $text
     * @return string
     */
    public function removeWiki( $text ) {
        $fname = __METHOD__;
        wfProfileIn( $fname );

        $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
        $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
        $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
        $text = preg_replace_callback(
            "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
            array( $this, 'linkReplace' ),
            $text
        );
        $text = preg_replace( "/<\/?[^>]+>/", "", $text );
        $text = preg_replace( "/'''''/", "", $text );
        $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
        $text = preg_replace( "/''/", "", $text );

        wfProfileOut( $fname );
        return $text;
    }

    /**
     * preg_replace_callback() helper for piped links, used by removeWiki().
     *
     * Links whose target has a namespace prefix resolving to NS_FILE or
     * NS_CATEGORY are kept whole; every other link is replaced by its caption.
     *
     * @param array $matches [0] whole link, [1] target including the trailing
     *   pipe, [2] caption
     * @return string
     */
    public function linkReplace( $matches ) {
        $colon = strpos( $matches[1], ':' );
        if ( $colon === false ) {
            return $matches[2]; // replace with caption
        }
        global $wgContLang;
        $ns = substr( $matches[1], 0, $colon );
        $index = $wgContLang->getNsIndex( $ns );
        if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
            return $matches[0]; // return the whole thing
        } else {
            return $matches[2];
        }
    }

    /**
     * Simple, line-based highlighting: every line containing a term is shown,
     * truncated around the match, up to $contextlines lines.
     *
     * @param string $text
     * @param array $terms Regex fragments; joined with '|' and interpolated
     *   into the pattern as-is
     * @param int $contextlines Maximum number of lines to return
     * @param int $contextchars Passed to truncate() for the text on either
     *   side of the match
     * @return string HTML-escaped lines, each terminated by "\n", with matches
     *   wrapped in <span class='searchmatch'>
     */
    public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
        global $wgContLang;
        $fname = __METHOD__;

        $lines = explode( "\n", $text );

        $terms = implode( '|', $terms );
        $max = intval( $contextchars ) + 1;
        $pat1 = "/(.*)($terms)(.{0,$max})/i";

        $lineno = 0;

        $extract = "";
        wfProfileIn( "$fname-extract" );
        foreach ( $lines as $line ) {
            if ( 0 == $contextlines ) {
                break;
            }
            ++$lineno;
            $m = array();
            if ( !preg_match( $pat1, $line, $m ) ) {
                continue;
            }
            --$contextlines;
            // truncate function changes ... to relevant i18n message.
            $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

            if ( count( $m ) < 3 ) {
                $post = '';
            } else {
                $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
            }

            $found = $m[2];

            $line = htmlspecialchars( $pre . $found . $post );
            // NOTE(review): terms are matched against the already-escaped
            // line, so a term can also match inside an entity such as "&amp;"
            $pat2 = '/(' . $terms . ")/i";
            $line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

            // plain concatenation; the "${var}" interpolation form is
            // deprecated as of PHP 8.2
            $extract .= $line . "\n";
        }
        wfProfileOut( "$fname-extract" );

        return $extract;
    }

    /**
     * No highlighting: return the first $contextlines non-empty lines of the
     * text, limited to $contextlines * $contextchars bytes.
     *
     * @param string $text
     * @param int $contextlines Maximum number of lines
     * @param int $contextchars Per-line size used to compute the byte limit
     * @return string HTML-escaped text with "\n" replaced by '<br>'
     */
    public function highlightNone( $text, $contextlines, $contextchars ) {
        $match = array();
        $text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
        // remove empty lines; a run of any length is collapsed, not just pairs
        $text = preg_replace( "/\n{2,}/", "\n", $text );
        preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

        $snippet = trim( isset( $match[0] ) ? $match[0] : '' );

        // limit to max number of bytes, backing up to a character boundary so
        // that a multi-byte UTF-8 sequence is never cut in half (a broken
        // sequence makes htmlspecialchars() discard or mangle the output)
        $limit = $contextlines * $contextchars;
        if ( $limit >= 0 && strlen( $snippet ) > $limit ) {
            while ( $limit > 0 && ( ord( $snippet[$limit] ) & 0xC0 ) === 0x80 ) {
                $limit--;
            }
        }
        $text = htmlspecialchars( substr( $snippet, 0, $limit ) );

        return str_replace( "\n", '<br>', $text );
    }
}