MediaWiki
REL1_23
|
<?php

/**
 * Builds search-result snippets from page text and marks the matched terms.
 *
 * Two strategies are offered: highlightText() splits the wikitext into plain
 * text lines and template/image/table lines and assembles context snippets
 * from them, while highlightSimple() works line by line with a single regex.
 *
 * All lengths and offsets in this class are byte based (strlen/substr).
 */
class SearchHighlighter {
	/** Whether splitAndAdd() runs removeWiki() on text before splitting it into lines */
	public $mCleanWikitext = true;

	/**
	 * @param bool $cleanupWikitext Value stored in $mCleanWikitext
	 */
	public function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Build a highlighted extract of $text around the given terms.
	 *
	 * The text is first split into "text" lines and "other" lines (templates,
	 * file links, tables and, when wfCite() exists, <ref> blocks). Snippets are
	 * then taken, in order of preference, from: the beginning of the text if it
	 * contains every term, lines matching the whole phrase, lines matching any
	 * term. Finally each term is wrapped in <span class='searchmatch'>.
	 *
	 * NOTE(review): the snippet text is not passed through htmlspecialchars()
	 * in this method; confirm that callers treat the result accordingly.
	 *
	 * @param string $text Wikitext to extract from
	 * @param array $terms Terms to highlight; they are inserted into regular
	 *  expressions as-is, so they must already be valid regex fragments
	 * @param int $contextlines Maximum number of snippet lines
	 * @param int $contextchars Approximate number of characters per snippet line
	 * @return string Extract containing <b> ... </b> separators and searchmatch
	 *  spans, or '' when $text is empty
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		global $wgSearchHighlightBoundaries;
		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

		// @todo FIXME: This should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					// with PREG_OFFSET_CAPTURE a group that did not take part has offset -1
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								// not a file link: $epat stays empty, the rest is added as text below
								break;
							}
						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						// group 2 is the closing token; it is only present when it matched
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $term );
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		$termChars = mb_strlen( $anyterm );
		// an empty term list yields an empty $anyterm; do not divide by zero then
		$scale = $termChars > 0 ? strlen( $anyterm ) / $termChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		// compare against '' so that a first line of "0" is not mistaken for "no text"
		if ( $firstText !== '' ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract( $all[$index], $offsets[$index],
						$offsets[$index] + $targetchars, $offsets[$index] );
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					// pre-set to 0 (non-null) so that extract() reports the start back
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// NOTE(review): snippets are used without htmlspecialchars() here
		$snippets = $extended;
		$last = -1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == -1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract !== '' ) {
			$extract .= '<b> ... </b>';
		}

		$processed = array();
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and append the non-empty, trimmed lines to $extracts.
	 *
	 * When $mCleanWikitext is set, the text is run through removeWiki() first.
	 *
	 * @param array &$extracts Receives the lines, keyed by sequence number
	 * @param int &$count Next sequence number; incremented for every line added
	 * @param string $text Text to split on "\n"
	 */
	public function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// compare against '' so that a line consisting of "0" is kept
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * preg_replace_callback() callback that makes a multi-byte character match
	 * case-insensitively by turning it into a [lowercase uppercase] character class.
	 *
	 * @param array $matches Match array; $matches[0] is a single character
	 * @return string Character class for multi-byte characters, the character
	 *  itself for single-byte ones
	 */
	public function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract part of the text from $start to $end, moving both ends to a
	 * nearby position chosen by position().
	 *
	 * @param string $text
	 * @param int $start Byte offset; used unchanged when 0
	 * @param int $end Byte offset; clamped to the length of $text
	 * @param int|null &$posStart Set to the adjusted start, but only when a
	 *  non-null value was passed in
	 * @param int|null &$posEnd Set to the adjusted end, but only when a
	 *  non-null value was passed in
	 * @return string The extracted part, or '' when the adjusted end is not
	 *  after the adjusted start
	 */
	public function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a byte position near $point at which the text can be cut.
	 *
	 * The first break character (space or one of a fixed set of punctuation
	 * characters) inside the window of 10 bytes on either side of $point wins.
	 * Without one, $point is moved forward past UTF-8 continuation bytes.
	 *
	 * @param string $text
	 * @param int $point Byte offset to search around
	 * @param int $offset Added to the result when a break character was found
	 * @return int Byte offset
	 */
	public function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = array();
		if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
			return $m[0][1] + $s + $offset;
		} else {
			// nothing to inspect at or past the end of the string; same result as
			// before, but without reading an uninitialized string offset
			if ( $point >= strlen( $text ) ) {
				return $point;
			}
			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= strlen( $text ) ) {
					return strlen( $text );
				}
				$char = ord( $text[$point] );
			}
			return $point;
		}
	}

	/**
	 * Search the extracts for a pattern and add a snippet for each matching
	 * line to $out, until $linesleft reaches zero.
	 *
	 * @param string $pattern Regular expression to look for
	 * @param array $extracts Lines to search, keyed by sequence number
	 * @param int &$linesleft Number of snippets still wanted; decremented per snippet
	 * @param int &$contextchars Length of a snippet in bytes (not modified here)
	 * @param array &$out Receives the snippets, keyed like $extracts
	 * @param array &$offsets Receives the start offset of each snippet in its line
	 */
	public function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				// match ends within the first $contextchars bytes: start at the beginning
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				// match is longer than a snippet: start at the match
				$begin = $offset;
			} else {
				// centre the match inside the snippet
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			// non-null so that extract() reports the adjusted start back
			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal: strips simple templates, reduces links to their
	 * target or caption, and removes tags and quote markup.
	 *
	 * @param string $text
	 * @return string
	 */
	public function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		// {{template}} without parameters is dropped
		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		// {{name|rest}} keeps everything after the first pipe
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		// [[target]] keeps the target
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		// [[target|caption]] is decided by linkReplace()
		$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
		// tags
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		// quote markup
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * preg_replace_callback() callback for piped links: file and category
	 * links are kept whole, every other link is replaced by its caption.
	 *
	 * @param array $matches $matches[1] is the target including the pipe,
	 *  $matches[2] the caption
	 * @return string
	 */
	public function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple and fast line-based highlighting: every line containing a term
	 * yields one HTML-escaped output line with the terms wrapped in
	 * <span class='searchmatch'>.
	 *
	 * @param string $text
	 * @param array $terms Terms to highlight; joined with '|' and inserted
	 *  into regular expressions as-is
	 * @param int $contextlines Maximum number of lines to output
	 * @param int $contextchars Context kept before and after the match
	 * @return string Output lines, each terminated by "\n"
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = array();
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], -$contextchars, '...', false );

			if ( isset( $m[3] ) ) {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			} else {
				$post = '';
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			$extract .= $line . "\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}
}