[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
<?php
/**
 * Basic search engine highlighting
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Search
 */

/**
 * Highlight bits of wikitext
 *
 * Offers three strategies: highlightText() (context-aware snippets),
 * highlightSimple() (per-line regex match) and highlightNone() (just the
 * first lines of the text).
 *
 * @ingroup Search
 */
class SearchHighlighter {
	/** @var bool Whether splitAndAdd() runs removeWiki() on text before storing it */
	protected $mCleanWikitext = true;

	/**
	 * @param bool $cleanupWikitext Strip basic wiki markup from extracts
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Default implementation of wikitext highlighting
	 *
	 * Splits the text into plain-text lines and "other" lines (templates,
	 * image links, tables and, when wfCite exists, references), picks up to
	 * $contextlines snippets that match the terms, pads them to a roughly
	 * constant size and wraps every matched term in a searchmatch span.
	 *
	 * The terms are interpolated into regular expressions as-is, so they
	 * are treated as regex fragments, not as literal strings.
	 *
	 * @param string $text
	 * @param array $terms Terms to highlight (unescaped)
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang, $wgSearchHighlightBoundaries;

		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

		// @todo FIXME: This should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					// with PREG_OFFSET_CAPTURE, groups that did not take part have offset -1
					if ( $key > 0 && $val[1] != - 1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, - 1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}

						}
						$epat = $endPatterns[$key];
						// everything before the opening marker is plain text
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback(
					'/./us',
					array( $this, 'caseCallback' ),
					$terms[$index]
				);
			} else {
				$terms[$index] = $term;
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// With no terms $anyterm is empty; use a neutral scale instead of
		// dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			// skip lines starting with ';' or ':'
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract(
						$all[$index],
						$offsets[$index],
						$offsets[$index] + $targetchars,
						$offsets[$index]
					);
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// $snippets = array_map( 'htmlspecialchars', $extended );
		$snippets = $extended;
		$last = - 1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == - 1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
			) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		$processed = array();
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and add it to extracts array
	 *
	 * Each line is trimmed; lines that are falsy after trimming (empty, or
	 * the string "0") are dropped. When wikitext cleanup is enabled the
	 * text goes through removeWiki() before being split.
	 *
	 * @param array $extracts Index -> $line
	 * @param int $count Next free index; incremented for every stored line
	 * @param string $text
	 */
	public function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			if ( $tt ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * A multi-byte match is turned into a character class holding its
	 * lowercase and uppercase forms; a single-byte match is returned as is.
	 *
	 * @param array $matches
	 * @return string
	 */
	public function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 *
	 * The out parameters are only written when the caller passes a
	 * non-null value for them.
	 *
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int $posStart (out) actual start position
	 * @param int $posEnd (out) actual end position
	 * @return string Empty string when the adjusted end is not past the start
	 */
	public function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			// offset 1 puts the start just after the boundary character
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * Looks at the window of up to 10 bytes either side of $point and
	 * returns the position of the first boundary character found in it.
	 * Without one, $point is moved forward past UTF-8 continuation bytes.
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset Offset to found index
	 * @return int Nearest nonletter index, or beginning of utf8 char if none
	 */
	public function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = array();

		if ( preg_match(
			'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
			substr( $text, $s, $l ),
			$m,
			PREG_OFFSET_CAPTURE
		) ) {
			return $m[0][1] + $s + $offset;
		} else {
			$textLen = strlen( $text );
			// a point at or past the end has no byte to inspect; clamp it
			// the same way the loop below does when it runs off the end
			if ( $point >= $textLen ) {
				return $textLen;
			}

			// check if point is on a valid first UTF8 char
			$char = ord( $text[$point] );
			while ( $char >= 0x80 && $char < 0xc0 ) {
				// skip trailing bytes
				$point++;
				if ( $point >= $textLen ) {
					return $textLen;
				}
				$char = ord( $text[$point] );
			}

			return $point;
		}
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * Stops as soon as $linesleft reaches zero. Lines that already have an
	 * entry in $out are skipped.
	 *
	 * @param string $pattern Regexp for matching lines
	 * @param array $extracts Extracts to search
	 * @param int $linesleft Number of extracts to make; decremented per snippet
	 * @param int $contextchars Length of snippet
	 * @param array $out Map for highlighted snippets
	 * @param array $offsets Map of starting points of snippets
	 * @protected
	 */
	public function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				// match ends within the first $contextchars bytes: start at the beginning
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				// match is longer than the snippet: start at the match
				$begin = $offset;
			} else {
				// centre the match inside the snippet
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal
	 *
	 * Drops templates without parameters, keeps the part after the first
	 * pipe of other templates, unwraps links (see linkReplace() for piped
	 * links), and strips tags and quote-based bold/italic markers.
	 *
	 * @protected
	 * @param string $text
	 * @return mixed
	 */
	public function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		// $text = preg_replace( "/'{2,5}/", "", $text );
		// $text = preg_replace( "/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text );
		// $text = preg_replace( "/\[\[([^]|]+)\]\]/", "\\1", $text );
		// $text = preg_replace( "/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text );
		// $text = preg_replace( "/\\{\\|(.*?)\\|\\}/", "", $text );
		// $text = preg_replace( "/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback(
			"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
			array( $this, 'linkReplace' ),
			$text
		);
		// $text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches Whole link, "target|" part, caption
	 * @return string
	 */
	public function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple & fast snippet extraction, but gives completely irrelevant
	 * snippets
	 *
	 * Emits one output line, terminated by a newline, for each of the first
	 * $contextlines input lines that match any term. The terms are
	 * interpolated into the regular expressions as-is.
	 *
	 * @param string $text
	 * @param array $terms
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = array();
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

			if ( isset( $m[3] ) ) {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			} else {
				$post = '';
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			// plain concatenation: no stray brace ends up in the output
			$extract .= $line . "\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Returns the first few lines of the text
	 *
	 * The result is HTML-escaped and its line breaks are turned into <br>.
	 *
	 * @param string $text
	 * @param int $contextlines Max number of returned lines
	 * @param int $contextchars Average number of characters per line
	 * @return string
	 */
	public function highlightNone( $text, $contextlines, $contextchars ) {
		$match = array();
		$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
		$text = str_replace( "\n\n", "\n", $text ); // remove empty lines
		if ( !preg_match( "/^(.*\n){0,$contextlines}/", $text, $match ) ) {
			// nothing matched, so there is no $match[0] to read
			return '';
		}
		// trim and limit to max number of chars
		$text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
		return str_replace( "\n", '<br>', $text );
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |