[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
<?php
/**
 * Methods to play with strings.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

/**
 * A collection of static methods to play with strings.
 */
class StringUtils {
	/**
	 * Test whether a string is valid UTF-8.
	 *
	 * The function checks for invalid byte sequences and overlong encodings,
	 * but not for different normalisations.
	 *
	 * This relies internally on the mbstring function mb_check_encoding()
	 * hardcoded to check against UTF-8. Whenever the function is not available
	 * we fall back to a pure PHP implementation. Setting $disableMbstring to
	 * true will skip the use of mb_check_encoding; this is mostly intended for
	 * unit testing our internal implementation.
	 *
	 * @since 1.21
	 * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
	 * In particular, the pure PHP code path did not in fact check for overlong forms.
	 * Beware of this when backporting code to that version of MediaWiki.
	 *
	 * @param string $value String to check
	 * @param bool $disableMbstring Whether to use the pure PHP
	 * implementation instead of trying mb_check_encoding. Intended for unit
	 * testing. Default: false
	 *
	 * @return bool Whether the given $value is a valid UTF-8 encoded string
	 */
	public static function isUtf8( $value, $disableMbstring = false ) {
		$value = (string)$value;

		// If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
		// U+10FFFF are incorrectly allowed, so we have to check for them separately.
		if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
			// Probed once per request: true when mb_check_encoding() already rejects > U+10FFFF
			static $newPHP;
			if ( $newPHP === null ) {
				$newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
			}

			return mb_check_encoding( $value, 'UTF-8' ) &&
				( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
		}

		if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
			// String contains only ASCII characters, has to be valid
			return true;
		}

		// PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
		// for large input, we check for invalid sequences (<= 5 bytes) rather than valid
		// sequences, which can be as long as the input string is. Multiple short regexes are
		// used rather than a single long regex for performance.
		static $regexes;
		if ( $regexes === null ) {
			$cont = "[\x80-\xbf]";
			$after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
			$regexes = array(
				// Continuation byte at the start
				"/^$cont/",

				// ASCII byte followed by a continuation byte
				"/[\\x00-\x7f]$cont/S",

				// Illegal byte
				"/[\xc0\xc1\xf5-\xff]/S",

				// Invalid 2-byte sequence, or valid one then an extra continuation byte
				"/[\xc2-\xdf](?!$cont$after)/S",

				// Invalid 3-byte sequence, or valid one then an extra continuation byte
				"/\xe0(?![\xa0-\xbf]$cont$after)/",
				"/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
				"/\xed(?![\x80-\x9f]$cont$after)/",

				// Invalid 4-byte sequence, or valid one then an extra continuation byte
				"/\xf0(?![\x90-\xbf]$cont{2}$after)/",
				"/[\xf1-\xf3](?!$cont{3}$after)/S",
				"/\xf4(?![\x80-\x8f]$cont{2}$after)/",
			);
		}

		// Any single hit on an "invalid sequence" pattern makes the whole string invalid
		foreach ( $regexes as $regex ) {
			if ( preg_match( $regex, $value ) !== 0 ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Perform an operation equivalent to
	 *
	 *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
	 *
	 * except that it's worst-case O(N) instead of O(N^2)
	 *
	 * Compared to delimiterReplace(), this implementation is fast but memory-
	 * hungry and inflexible. The memory requirements are such that I don't
	 * recommend using it on anything but guaranteed small chunks of text.
	 *
	 * @param string $startDelim
	 * @param string $endDelim
	 * @param string $replace
	 * @param string $subject
	 *
	 * @return string
	 */
	public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
		$segments = explode( $startDelim, $subject );
		// Everything before the first start delimiter is copied verbatim
		$output = array_shift( $segments );
		foreach ( $segments as $s ) {
			$endDelimPos = strpos( $s, $endDelim );
			if ( $endDelimPos === false ) {
				// No end delimiter in this segment: restore the start delimiter untouched
				$output .= $startDelim . $s;
			} else {
				$output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
			}
		}

		return $output;
	}

	/**
	 * Perform an operation equivalent to
	 *
	 *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
	 *
	 * This implementation is slower than hungryDelimiterReplace but uses far less
	 * memory. The delimiters are literal strings, not regular expressions.
	 *
	 * If the start delimiter ends with an initial substring of the end delimiter,
	 * e.g. in the case of C-style comments, the behavior differs from the model
	 * regex. In this implementation, the end must share no characters with the
	 * start, so e.g. /*\/ is not considered to be both the start and end of a
	 * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param callable $callback Function to call on each match. It receives an
	 *   array whose element 0 is the full match (delimiters included) and whose
	 *   element 1 is the text between the delimiters.
	 * @param string $subject
	 * @param string $flags Regular expression flags
	 * @throws MWException
	 * @return string
	 */
	public static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
		$subject, $flags = ''
	) {
		$inputPos = 0;
		$outputPos = 0;
		$contentPos = 0;
		$output = '';
		$foundStart = false;
		$encStart = preg_quote( $startDelim, '!' );
		$encEnd = preg_quote( $endDelim, '!' );
		$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
		$endLength = strlen( $endDelim );
		$subjectLength = strlen( $subject );
		$m = array();

		while ( $inputPos < $subjectLength &&
			preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
		) {
			$tokenOffset = $m[0][1];
			if ( $m[1][0] != '' ) {
				if ( $foundStart &&
					$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
				) {
					# An end match is present at the same location
					$tokenType = 'end';
					$tokenLength = $endLength;
				} else {
					$tokenType = 'start';
					$tokenLength = strlen( $m[0][0] );
				}
			} elseif ( isset( $m[2] ) && $m[2][0] != '' ) {
				$tokenType = 'end';
				$tokenLength = strlen( $m[0][0] );
			} else {
				# Neither group captured any text, e.g. an empty delimiter was given
				throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
			}

			if ( $tokenType == 'start' ) {
				# Only move the start position if we haven't already found a start
				# This means that START START END matches outer pair
				if ( !$foundStart ) {
					# Found start
					$inputPos = $tokenOffset + $tokenLength;
					# Write out the non-matching section
					$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
					$outputPos = $tokenOffset;
					$contentPos = $inputPos;
					$foundStart = true;
				} else {
					# Move the input position past the *first character* of START,
					# to protect against missing END when it overlaps with START
					$inputPos = $tokenOffset + 1;
				}
			} elseif ( $tokenType == 'end' ) {
				if ( $foundStart ) {
					# Found match
					$output .= call_user_func( $callback, array(
						substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
						substr( $subject, $contentPos, $tokenOffset - $contentPos )
					) );
					$foundStart = false;
				} else {
					# Non-matching end, write it out together with the text before it.
					# $inputPos and $outputPos are equal whenever no start is pending,
					# so the copy is anchored on $outputPos for both offset and length.
					$output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
				}
				$inputPos = $outputPos = $tokenOffset + $tokenLength;
			} else {
				throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
			}
		}
		# Flush whatever is left, including an unterminated START and its content
		if ( $outputPos < $subjectLength ) {
			$output .= substr( $subject, $outputPos );
		}

		return $output;
	}

	/**
	 * Perform an operation equivalent to
	 *
	 *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
	 *
	 * @param string $startDelim Start delimiter regular expression
	 * @param string $endDelim End delimiter regular expression
	 * @param string $replace Replacement string. May contain $1, which will be
	 *   replaced by the text between the delimiters
	 * @param string $subject String to search
	 * @param string $flags Regular expression flags
	 * @return string The string with the matches replaced
	 */
	public static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
		$replacer = new RegexlikeReplacer( $replace );

		return self::delimiterReplaceCallback( $startDelim, $endDelim,
			$replacer->cb(), $subject, $flags );
	}

	/**
	 * More or less "markup-safe" explode()
	 * Ignores any instances of the separator inside <...>
	 * @param string $separator
	 * @param string $text
	 * @return array
	 */
	public static function explodeMarkup( $separator, $text ) {
		$placeholder = "\x00";

		// Remove placeholder instances
		$text = str_replace( $placeholder, '', $text );

		// Replace instances of the separator inside HTML-like tags with the placeholder
		$replacer = new DoubleReplacer( $separator, $placeholder );
		$cleaned = self::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );

		// Explode, then put the replaced separators back in
		$items = explode( $separator, $cleaned );
		foreach ( $items as $i => $str ) {
			$items[$i] = str_replace( $placeholder, $separator, $str );
		}

		return $items;
	}

	/**
	 * Escape a string to make it suitable for inclusion in a preg_replace()
	 * replacement parameter.
	 *
	 * @param string $string
	 * @return string
	 */
	public static function escapeRegexReplacement( $string ) {
		// Backslashes first, so the ones added for "$" are not doubled again
		$string = str_replace( '\\', '\\\\', $string );
		$string = str_replace( '$', '\\$', $string );

		return $string;
	}

	/**
	 * Workalike for explode() with limited memory usage.
	 * Returns an Iterator
	 * @param string $separator
	 * @param string $subject
	 * @return ArrayIterator|ExplodeIterator
	 */
	public static function explode( $separator, $subject ) {
		// Only switch to the lazy iterator when there are more than 1000 separators
		if ( substr_count( $subject, $separator ) > 1000 ) {
			return new ExplodeIterator( $separator, $subject );
		} else {
			return new ArrayIterator( explode( $separator, $subject ) );
		}
	}
}

/**
 * Base class for "replacers", objects used in preg_replace_callback() and
 * StringUtils::delimiterReplaceCallback()
 */
class Replacer {
	/**
	 * Get a callable that invokes this object's replace() method.
	 *
	 * @return array
	 */
	public function cb() {
		// Objects are passed by handle, so no reference to $this is needed here
		return array( $this, 'replace' );
	}
}

/**
 * Class to replace regex matches with a string similar to that used in preg_replace()
 */
class RegexlikeReplacer extends Replacer {
	private $r;

	/**
	 * @param string $r Replacement template, may contain $0, $1, ... placeholders
	 */
	public function __construct( $r ) {
		$this->r = $r;
	}

	/**
	 * Substitute each "$<index>" placeholder in the template with the
	 * corresponding element of $matches.
	 *
	 * @param array $matches
	 * @return string
	 */
	public function replace( $matches ) {
		$pairs = array();
		foreach ( $matches as $i => $match ) {
			$pairs["\$$i"] = $match;
		}

		return strtr( $this->r, $pairs );
	}
}

/**
 * Class to perform secondary replacement within each replacement string
 */
class DoubleReplacer extends Replacer {
	// Declared explicitly (they used to be created dynamically in the constructor);
	// kept public so code that read them from outside keeps working.
	public $from;
	public $to;
	public $index;

	/**
	 * @param mixed $from
	 * @param mixed $to
	 * @param int $index
	 */
	public function __construct( $from, $to, $index = 0 ) {
		$this->from = $from;
		$this->to = $to;
		$this->index = $index;
	}

	/**
	 * Replace $from with $to inside the match element selected by $index.
	 *
	 * @param array $matches
	 * @return mixed
	 */
	public function replace( $matches ) {
		return str_replace( $this->from, $this->to, $matches[$this->index] );
	}
}

/**
 * Class to perform replacement based on a simple hashtable lookup
 */
class HashtableReplacer extends Replacer {
	private $table, $index;

	/**
	 * @param array $table
	 * @param int $index
	 */
	public function __construct( $table, $index = 0 ) {
		$this->table = $table;
		$this->index = $index;
	}

	/**
	 * Look up the match element selected by $index in the table.
	 *
	 * @param array $matches
	 * @return mixed
	 */
	public function replace( $matches ) {
		return $this->table[$matches[$this->index]];
	}
}

/**
 * Replacement array for FSS with fallback to strtr()
 * Supports lazy initialisation of FSS resource
 */
class ReplacementArray {
	private $data = false;
	private $fss = false;

	/**
	 * Create an object with the specified replacement array
	 * The array should have the same form as the replacement array for strtr()
	 * @param array $data
	 */
	public function __construct( $data = array() ) {
		$this->data = $data;
	}

	/**
	 * Only the replacement data is serialized; the FSS handle is rebuilt lazily.
	 *
	 * @return array
	 */
	public function __sleep() {
		return array( 'data' );
	}

	public function __wakeup() {
		$this->fss = false;
	}

	/**
	 * Set the whole replacement array at once
	 * @param array $data
	 */
	public function setArray( $data ) {
		$this->data = $data;
		$this->fss = false;
	}

	/**
	 * @return array|bool
	 */
	public function getArray() {
		return $this->data;
	}

	/**
	 * Set an element of the replacement array
	 * @param string $from
	 * @param string $to
	 */
	public function setPair( $from, $to ) {
		$this->data[$from] = $to;
		$this->fss = false;
	}

	/**
	 * Add the given pairs to the replacement array; on duplicate keys the
	 * new value wins.
	 *
	 * @param array $data
	 */
	public function mergeArray( $data ) {
		// array_replace() rather than array_merge(): PHP stores integer-like string
		// keys such as '1' as integers, and array_merge() would renumber them,
		// silently changing the search strings.
		$this->data = array_replace( $this->data, $data );
		$this->fss = false;
	}

	/**
	 * Add all pairs of another ReplacementArray; on duplicate keys the other
	 * array's value wins.
	 *
	 * @param ReplacementArray $other
	 */
	public function merge( $other ) {
		// See mergeArray() for why array_merge() must not be used here
		$this->data = array_replace( $this->data, $other->data );
		$this->fss = false;
	}

	/**
	 * @param string $from
	 */
	public function removePair( $from ) {
		unset( $this->data[$from] );
		$this->fss = false;
	}

	/**
	 * Remove every pair whose key appears as a key of $data.
	 *
	 * @param array $data
	 */
	public function removeArray( $data ) {
		foreach ( array_keys( $data ) as $from ) {
			unset( $this->data[$from] );
		}
		$this->fss = false;
	}

	/**
	 * Apply the replacement array to a string, using the FSS extension when
	 * it is available and strtr() otherwise.
	 *
	 * @param string $subject
	 * @return string
	 */
	public function replace( $subject ) {
		if ( function_exists( 'fss_prep_replace' ) ) {
			wfProfileIn( __METHOD__ . '-fss' );
			// Build the FSS handle on first use after any change to the data
			if ( $this->fss === false ) {
				$this->fss = fss_prep_replace( $this->data );
			}
			$result = fss_exec_replace( $this->fss, $subject );
			wfProfileOut( __METHOD__ . '-fss' );
		} else {
			wfProfileIn( __METHOD__ . '-strtr' );
			$result = strtr( $subject, $this->data );
			wfProfileOut( __METHOD__ . '-strtr' );
		}

		return $result;
	}
}

/**
 * An iterator which works exactly like:
 *
 *     foreach ( explode( $delim, $s ) as $element ) {
 *         ...
 *     }
 *
 * Except it doesn't use 193 bytes per element
 */
class ExplodeIterator implements Iterator {
	// The subject string
	private $subject, $subjectLength;

	// The delimiter
	private $delim, $delimLength;

	// The position of the start of the line
	private $curPos;

	// The position after the end of the next delimiter
	private $endPos;

	// The current token
	private $current;

	/**
	 * Construct an ExplodeIterator
	 * @param string $delim
	 * @param string $subject
	 */
	public function __construct( $delim, $subject ) {
		$this->subject = $subject;
		$this->delim = $delim;

		// Micro-optimisation (theoretical)
		$this->subjectLength = strlen( $subject );
		$this->delimLength = strlen( $delim );

		$this->rewind();
	}

	public function rewind() {
		$this->curPos = 0;
		$this->endPos = strpos( $this->subject, $this->delim );
		$this->refreshCurrent();
	}

	/**
	 * Recompute the current token from the current and end positions.
	 */
	public function refreshCurrent() {
		if ( $this->curPos === false ) {
			// Iteration is finished
			$this->current = false;
		} elseif ( $this->curPos >= $this->subjectLength ) {
			// Subject is empty or ends with a delimiter: final empty element
			$this->current = '';
		} elseif ( $this->endPos === false ) {
			// No further delimiter: the rest of the subject is the last element
			$this->current = substr( $this->subject, $this->curPos );
		} else {
			$this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
		}
	}

	public function current() {
		return $this->current;
	}

	/**
	 * @return int|bool Current position or boolean false if invalid
	 */
	public function key() {
		return $this->curPos;
	}

	/**
	 * Advance to the next element.
	 *
	 * @return string|bool The new current element, or false once past the end
	 */
	public function next() {
		if ( $this->endPos === false ) {
			// The element just consumed was the last one
			$this->curPos = false;
		} else {
			$this->curPos = $this->endPos + $this->delimLength;
			if ( $this->curPos >= $this->subjectLength ) {
				$this->endPos = false;
			} else {
				$this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
			}
		}
		$this->refreshCurrent();

		return $this->current;
	}

	/**
	 * @return bool
	 */
	public function valid() {
		return $this->curPos !== false;
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |