MediaWiki
REL1_24
|
<?php
/**
 * Builds search-result snippets from page text and marks the matched terms.
 *
 * Three strategies are offered:
 *  - highlightText():   splits the text into plain text and templates/file
 *                       links/tables, picks the best matching lines and wraps
 *                       matches in <span class='searchmatch'>.
 *  - highlightSimple(): line-by-line regex match with fixed context.
 *  - highlightNone():   no highlighting, just the first lines of the text.
 */
class SearchHighlighter {
	/** @var bool Whether splitAndAdd() runs removeWiki() on text before splitting it */
	protected $mCleanWikitext = true;

	/**
	 * @param bool $cleanupWikitext If true, wiki markup is stripped from
	 *  extracts via removeWiki()
	 */
	public function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Build a highlighted snippet, preferring lines that contain the whole
	 * phrase, then lines that contain any single term.
	 *
	 * The returned string is not passed through htmlspecialchars() here; the
	 * only markup added by this method is the <b> ... </b> separators and the
	 * searchmatch spans.
	 *
	 * @param string $text Page text
	 * @param array $terms Search terms; they are interpolated unquoted into
	 *  PCRE patterns, so they are treated as regex fragments
	 * @param int $contextlines Maximum number of lines to pick snippets from
	 * @param int $contextchars Context size per line (scaled to bytes below)
	 * @return string Snippet, or '' when $text is empty
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang, $wgSearchHighlightBoundaries;

		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

		// @todo FIXME: This should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					// with PREG_OFFSET_CAPTURE, groups that did not participate have offset -1
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}

						}
						$epat = $endPatterns[$key];
						// everything before the opening marker is plain text
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						// group 2 is only present in the result when the closing marker matched
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback(
					'/./us',
					array( $this, 'caseCallback' ),
					$terms[$index]
				);
			} else {
				$terms[$index] = $term;
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// An empty term list gives an empty $anyterm; fall back to a scale of 1
		// instead of dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract(
						$all[$index],
						$offsets[$index],
						$offsets[$index] + $targetchars,
						$offsets[$index]
					);
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// the extended snippets are used as-is (no htmlspecialchars pass here)
		$snippets = $extended;
		$last = -1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == -1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
			) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		$processed = array();
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and append every non-empty trimmed line to the
	 * extract list, keyed by a running sequence number.
	 *
	 * Lines that are falsy after trim() (empty, or the single character "0")
	 * are skipped.
	 *
	 * @param array &$extracts Extract list to append to
	 * @param int &$count Next sequence number; incremented per added line
	 * @param string $text Text to split; cleaned with removeWiki() first when
	 *  mCleanWikitext is set
	 */
	public function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			if ( $tt ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * preg_replace_callback() callback used to emulate case-insensitive
	 * matching for multi-byte characters.
	 *
	 * @param array $matches Match array; $matches[0] is a single character
	 * @return string A "[lowerUPPER]" character class for multi-byte
	 *  characters, or the character unchanged when it is a single byte
	 */
	public function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract part of the text from start to end, with both ends adjusted
	 * by position() unless they sit at the very start/end of the text.
	 *
	 * @param string $text
	 * @param int $start Requested start byte offset
	 * @param int $end Requested end byte offset
	 * @param int|null &$posStart If passed non-null, receives the actual start position
	 * @param int|null &$posEnd If passed non-null, receives the actual end position
	 * @return string The extracted substring, or '' if the adjusted range is empty
	 */
	public function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a suitable cut position near $point.
	 *
	 * Looks for the first punctuation/space character inside a window of
	 * +/- 10 bytes around $point. If none is found, $point is moved forward
	 * past any UTF-8 continuation bytes so a character is not split.
	 *
	 * @param string $text
	 * @param int $point Byte offset to search around
	 * @param int $offset Added to the result when a boundary character is found
	 * @return int Byte position
	 */
	public function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = array();

		if ( preg_match(
			'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
			substr( $text, $s, $l ),
			$m,
			PREG_OFFSET_CAPTURE
		) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= strlen( $text ) ) {
					return strlen( $text );
				}
				$char = ord( $text[$point] );
			}

			return $point;

		}
	}

	/**
	 * Search the extracts for a pattern and record a basic snippet for each
	 * matching line until the line budget is used up.
	 *
	 * @param string $pattern Regexp for matching lines
	 * @param array $extracts Extracts to search, keyed by sequence number
	 * @param int &$linesleft Number of snippets still allowed; decremented per hit
	 * @param int &$contextchars Length of a snippet in bytes
	 * @param array &$out Map of sequence number => snippet
	 * @param array &$offsets Map of sequence number => start offset of the snippet
	 */
	public function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				// match ends within the first $contextchars bytes: start at the line start
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				// match is longer than the window: start at the match
				$begin = $offset;
			} else {
				// centre the window on the match
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal: strips templates, unwraps links, removes
	 * HTML-like tags and quote-based bold/italic markup.
	 *
	 * @param string $text
	 * @return string
	 */
	public function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		// templates without a pipe are dropped entirely
		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		// templates with a pipe keep everything after the first pipe
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		// unpiped links are replaced with their target
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		// piped links are resolved by linkReplace()
		$text = preg_replace_callback(
			"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
			array( $this, 'linkReplace' ),
			$text
		);
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * preg_replace_callback() callback for piped links: replaces the link with
	 * its caption, except for file and category links, which are kept whole.
	 *
	 * @param array $matches $matches[1] is the "target|" part, $matches[2] the caption
	 * @return string
	 */
	public function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple and quick highlighting: takes lines that match any of the terms
	 * and returns them with surrounding context, HTML-escaped, with the
	 * matches wrapped in searchmatch spans.
	 *
	 * @param string $text
	 * @param array $terms Search terms; joined with '|' and interpolated
	 *  unquoted into the pattern
	 * @param int $contextlines Maximum number of matching lines to return
	 * @param int $contextchars Context length passed to the language truncate()
	 * @return string One line per match, each terminated by "\n"
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$lineno = 0;

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			++$lineno;
			$m = array();
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], -$contextchars, '...', false );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			// "{$line}" instead of the "${line}" form, which is deprecated since PHP 8.2
			$extract .= "{$line}\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Return the first few lines of the text, HTML-escaped, without any
	 * highlighting.
	 *
	 * @param string $text
	 * @param int $contextlines Maximum number of lines to return
	 * @param int $contextchars Bytes per line; the total is capped at
	 *  $contextlines * $contextchars bytes
	 * @return string Escaped text with newlines replaced by <br>
	 */
	public function highlightNone( $text, $contextlines, $contextchars ) {
		$match = array();
		$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
		$text = str_replace( "\n\n", "\n", $text ); // remove empty lines
		preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

		// trim and limit to max number of chars
		$snippet = trim( $match[0] );
		$limit = $contextlines * $contextchars;
		if ( strlen( $snippet ) > $limit ) {
			// Do not cut inside a multi-byte UTF-8 character: htmlspecialchars()
			// rejects or substitutes invalid sequences. Back off while the first
			// dropped byte is a continuation byte (10xxxxxx).
			while ( $limit > 0 && ( ord( $snippet[$limit] ) & 0xC0 ) === 0x80 ) {
				$limit--;
			}
		}
		$text = htmlspecialchars( substr( $snippet, 0, $limit ) );
		return str_replace( "\n", '<br>', $text );
	}
}