MediaWiki
REL1_24
|
<?php

/**
 * A collection of static helpers for working with strings.
 */
class StringUtils {
	/**
	 * Test whether a string is valid UTF-8.
	 *
	 * Uses mb_check_encoding() when it is available (unless disabled by the
	 * caller), otherwise falls back to a set of short regular expressions that
	 * look for *invalid* byte sequences.
	 *
	 * @param string $value String to check; it is cast to string first
	 * @param bool $disableMbstring Force the pure-PCRE code path even when
	 *   mb_check_encoding() exists
	 * @return bool True if $value is valid UTF-8
	 */
	public static function isUtf8( $value, $disableMbstring = false ) {
		$value = (string)$value;

		// If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
		// U+10FFFF are incorrectly allowed, so we have to check for them separately.
		if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
			// Probe once per request whether mbstring already rejects > U+10FFFF
			static $newPHP;
			if ( $newPHP === null ) {
				$newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
			}

			return mb_check_encoding( $value, 'UTF-8' ) &&
				( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
		}

		if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
			// String contains only ASCII characters, has to be valid
			return true;
		}

		// PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
		// for large input, we check for invalid sequences (<= 5 bytes) rather than valid
		// sequences, which can be as long as the input string is. Multiple short regexes are
		// used rather than a single long regex for performance.
		static $regexes;
		if ( $regexes === null ) {
			$cont = "[\x80-\xbf]";
			$after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
			$regexes = array(
				// Continuation byte at the start
				"/^$cont/",

				// ASCII byte followed by a continuation byte
				"/[\\x00-\x7f]$cont/S",

				// Illegal byte
				"/[\xc0\xc1\xf5-\xff]/S",

				// Invalid 2-byte sequence, or valid one then an extra continuation byte
				"/[\xc2-\xdf](?!$cont$after)/S",

				// Invalid 3-byte sequence, or valid one then an extra continuation byte
				"/\xe0(?![\xa0-\xbf]$cont$after)/",
				"/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
				"/\xed(?![\x80-\x9f]$cont$after)/",

				// Invalid 4-byte sequence, or valid one then an extra continuation byte
				"/\xf0(?![\x90-\xbf]$cont{2}$after)/",
				"/[\xf1-\xf3](?!$cont{3}$after)/S",
				"/\xf4(?![\x80-\x8f]$cont{2}$after)/",
			);
		}

		// Any hit (or a PCRE failure, which returns false) means "not valid"
		foreach ( $regexes as $regex ) {
			if ( preg_match( $regex, $value ) !== 0 ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Replace every region that begins with $startDelim and runs up to and
	 * including the first following $endDelim with the literal string $replace.
	 *
	 * The subject is split on the start delimiter, so the work is linear in
	 * the input size. A start delimiter with no end delimiter after it (before
	 * the next start delimiter) is left in the output unchanged. $replace is
	 * inserted verbatim; no "$1"-style substitution takes place.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param string $replace Literal replacement text
	 * @param string $subject String to search
	 * @return string The string with the matches replaced
	 */
	public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
		$segments = explode( $startDelim, $subject );
		// Everything before the first start delimiter is copied through as-is
		$output = array_shift( $segments );
		foreach ( $segments as $s ) {
			$endDelimPos = strpos( $s, $endDelim );
			if ( $endDelimPos === false ) {
				// Unterminated region: restore the delimiter that explode() removed
				$output .= $startDelim . $s;
			} else {
				$output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
			}
		}

		return $output;
	}

	/**
	 * Replace delimited regions of $subject with the return value of a callback.
	 *
	 * The subject is scanned once for start and end delimiter tokens. When
	 * several start delimiters precede an end delimiter, the region begins at
	 * the first of them (START START END matches the outer pair). End
	 * delimiters without a preceding start are copied to the output unchanged,
	 * as is an unterminated region at the end of the subject.
	 *
	 * The callback receives one array argument:
	 *   - [0] the whole matched region including both delimiters
	 *   - [1] the content between the delimiters
	 * and its return value is appended to the output in place of the region.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param callable $callback Function to call on each match
	 * @param string $subject String to search
	 * @param string $flags Regular expression modifiers appended to the token
	 *   pattern; if it contains "i" the delimiters are compared case-insensitively
	 * @throws MWException If a token matches as neither a non-empty start nor
	 *   a non-empty end delimiter (for example, an empty delimiter was given)
	 * @return string The subject with all matched regions replaced
	 */
	public static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
		$subject, $flags = ''
	) {
		$inputPos = 0;
		$outputPos = 0;
		// Only meaningful while $foundStart is true; initialised for clarity
		$contentPos = 0;
		$output = '';
		$foundStart = false;
		$encStart = preg_quote( $startDelim, '!' );
		$encEnd = preg_quote( $endDelim, '!' );
		$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
		$endLength = strlen( $endDelim );
		$subjectLength = strlen( $subject );
		$m = array();

		while ( $inputPos < $subjectLength &&
			preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
		) {
			$tokenOffset = $m[0][1];
			if ( $m[1][0] != '' ) {
				if ( $foundStart &&
					$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
				) {
					# An end match is present at the same location
					$tokenType = 'end';
					$tokenLength = $endLength;
				} else {
					$tokenType = 'start';
					$tokenLength = strlen( $m[0][0] );
				}
			} elseif ( isset( $m[2] ) && $m[2][0] != '' ) {
				$tokenType = 'end';
				$tokenLength = strlen( $m[0][0] );
			} else {
				throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
			}

			if ( $tokenType == 'start' ) {
				# Only move the start position if we haven't already found a start
				# This means that START START END matches outer pair
				if ( !$foundStart ) {
					# Found start
					$inputPos = $tokenOffset + $tokenLength;
					# Write out the non-matching section
					$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
					$outputPos = $tokenOffset;
					$contentPos = $inputPos;
					$foundStart = true;
				} else {
					# Move the input position past the *first character* of START,
					# to protect against missing END when it overlaps with START
					$inputPos = $tokenOffset + 1;
				}
			} elseif ( $tokenType == 'end' ) {
				if ( $foundStart ) {
					# Found match
					$output .= call_user_func( $callback, array(
						substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
						substr( $subject, $contentPos, $tokenOffset - $contentPos )
					) );
					$foundStart = false;
				} else {
					# Non-matching end, write it out.
					# While no start is pending, $inputPos and $outputPos are always
					# equal, so $outputPos is used for both the offset and the length.
					$output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
				}
				$inputPos = $outputPos = $tokenOffset + $tokenLength;
			} else {
				throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
			}
		}
		# Copy whatever is left, including an unterminated region
		if ( $outputPos < $subjectLength ) {
			$output .= substr( $subject, $outputPos );
		}

		return $output;
	}

	/**
	 * Replace delimited regions of $subject with a replacement template.
	 *
	 * This is delimiterReplaceCallback() with a RegexlikeReplacer as the
	 * callback: in $replace, "$0" stands for the whole matched region
	 * including the delimiters and "$1" for the content between them.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param string $replace Replacement template; may contain $0 and $1
	 * @param string $subject String to search
	 * @param string $flags Regular expression modifiers, see delimiterReplaceCallback()
	 * @return string The subject with all matched regions replaced
	 */
	public static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
		$replacer = new RegexlikeReplacer( $replace );

		return self::delimiterReplaceCallback( $startDelim, $endDelim,
			$replacer->cb(), $subject, $flags );
	}

	/**
	 * Split $text on $separator, ignoring separators that appear inside
	 * "<...>" regions.
	 *
	 * NUL bytes are used internally as a placeholder and are therefore removed
	 * from $text before processing.
	 *
	 * @param string $separator Separator to split on
	 * @param string $text Text to split
	 * @return array The pieces, with separators inside "<...>" left intact
	 */
	public static function explodeMarkup( $separator, $text ) {
		$placeholder = "\x00";

		// Remove placeholder instances
		$text = str_replace( $placeholder, '', $text );

		// Replace instances of the separator inside HTML-like tags with the placeholder
		$replacer = new DoubleReplacer( $separator, $placeholder );
		$cleaned = self::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );

		// Explode, then put the replaced separators back in
		$items = explode( $separator, $cleaned );
		foreach ( $items as $i => $str ) {
			$items[$i] = str_replace( $placeholder, $separator, $str );
		}

		return $items;
	}

	/**
	 * Escape a string so that it can be used literally as the replacement
	 * argument of preg_replace(): backslashes and dollar signs are each
	 * prefixed with a backslash.
	 *
	 * @param string $string String to escape
	 * @return string Escaped string
	 */
	public static function escapeRegexReplacement( $string ) {
		// Backslashes must be doubled first so the ones added for "$" survive
		$string = str_replace( '\\', '\\\\', $string );
		$string = str_replace( '$', '\\$', $string );

		return $string;
	}

	/**
	 * Split a string and return the pieces as an iterator.
	 *
	 * When the subject contains more than 1000 separators an ExplodeIterator
	 * is returned, which produces the pieces lazily; otherwise the result of
	 * explode() is wrapped in an ArrayIterator. Note that the two iterators
	 * yield different keys (byte offsets versus sequential indexes).
	 *
	 * @param string $separator Separator to split on
	 * @param string $subject String to split
	 * @return ArrayIterator|ExplodeIterator
	 */
	public static function explode( $separator, $subject ) {
		if ( substr_count( $subject, $separator ) > 1000 ) {
			return new ExplodeIterator( $separator, $subject );
		} else {
			return new ArrayIterator( explode( $separator, $subject ) );
		}
	}
}

/**
 * Base class for objects that wrap a "replace" operation as a callback.
 * Subclasses provide the replace() method.
 */
class Replacer {
	/**
	 * Get a callable that invokes this object's replace() method.
	 *
	 * @return array Callable in array( object, method name ) form
	 */
	public function cb() {
		return array( $this, 'replace' );
	}
}

/**
 * Replacer that substitutes "$0", "$1", ... in a template string with the
 * corresponding elements of the match array.
 */
class RegexlikeReplacer extends Replacer {
	/** @var string Replacement template */
	private $r;

	/**
	 * @param string $r Replacement template containing $0, $1, ... placeholders
	 */
	public function __construct( $r ) {
		$this->r = $r;
	}

	/**
	 * Fill the template with the given matches.
	 *
	 * @param array $matches Match array; element N replaces "$N"
	 * @return string The filled-in template
	 */
	public function replace( $matches ) {
		$pairs = array();
		foreach ( $matches as $i => $match ) {
			$pairs["\$$i"] = $match;
		}

		// strtr() substitutes all placeholders in a single pass
		return strtr( $this->r, $pairs );
	}
}

/**
 * Replacer that runs str_replace() on one element of the match array.
 */
class DoubleReplacer extends Replacer {
	// These used to be created dynamically in the constructor, which made them
	// public; they are declared public here so existing access keeps working.

	/** @var mixed Search value passed to str_replace() */
	public $from;

	/** @var mixed Replacement value passed to str_replace() */
	public $to;

	/** @var int Index into the match array */
	public $index;

	/**
	 * @param mixed $from Value to search for
	 * @param mixed $to Value to replace it with
	 * @param int $index Which element of the match array to operate on
	 */
	public function __construct( $from, $to, $index = 0 ) {
		$this->from = $from;
		$this->to = $to;
		$this->index = $index;
	}

	/**
	 * @param array $matches Match array
	 * @return mixed The selected match with $from replaced by $to
	 */
	public function replace( $matches ) {
		return str_replace( $this->from, $this->to, $matches[$this->index] );
	}
}

/**
 * Replacer that looks up one element of the match array in a table.
 */
class HashtableReplacer extends Replacer {
	private $table, $index;

	/**
	 * @param array $table Lookup table keyed by matched text
	 * @param int $index Which element of the match array to look up
	 */
	public function __construct( $table, $index = 0 ) {
		$this->table = $table;
		$this->index = $index;
	}

	/**
	 * @param array $matches Match array
	 * @return mixed The table entry for the selected match
	 */
	public function replace( $matches ) {
		return $this->table[$matches[$this->index]];
	}
}

/**
 * A set of from => to replacement pairs that can be applied to a string.
 *
 * The fss_prep_replace() / fss_exec_replace() functions are used when they
 * exist; otherwise strtr() is used.
 */
class ReplacementArray {
	/** @var array|bool Replacement pairs, from => to */
	private $data = false;

	/** @var mixed Prepared fss handle, or false when it must be (re)built */
	private $fss = false;

	/**
	 * @param array $data Replacement pairs, from => to
	 */
	public function __construct( $data = array() ) {
		$this->data = $data;
	}

	/**
	 * Serialize only the pairs; the prepared fss handle is rebuilt on demand.
	 *
	 * @return array Names of the properties to serialize
	 */
	public function __sleep() {
		return array( 'data' );
	}

	/**
	 * Reset the prepared fss handle after unserialization.
	 */
	public function __wakeup() {
		$this->fss = false;
	}

	/**
	 * Replace the whole set of pairs.
	 *
	 * @param array $data Replacement pairs, from => to
	 */
	public function setArray( $data ) {
		$this->data = $data;
		$this->fss = false;
	}

	/**
	 * @return array|bool The current replacement pairs
	 */
	public function getArray() {
		return $this->data;
	}

	/**
	 * Set a single replacement pair.
	 *
	 * @param string $from Text to search for
	 * @param string $to Replacement text
	 */
	public function setPair( $from, $to ) {
		$this->data[$from] = $to;
		$this->fss = false;
	}

	/**
	 * Add pairs from an array; on duplicate keys the new value wins.
	 *
	 * @param array $data Replacement pairs, from => to
	 */
	public function mergeArray( $data ) {
		// array_replace() rather than array_merge(): the keys are the search
		// strings, and array_merge() would renumber integer-like keys such as '1'.
		$this->data = array_replace( $this->data, $data );
		$this->fss = false;
	}

	/**
	 * Add the pairs of another ReplacementArray; on duplicate keys the value
	 * from $other wins.
	 *
	 * @param ReplacementArray $other
	 */
	public function merge( $other ) {
		// See mergeArray() for why array_merge() is not used here
		$this->data = array_replace( $this->data, $other->data );
		$this->fss = false;
	}

	/**
	 * Remove a single pair.
	 *
	 * @param string $from Key of the pair to remove
	 */
	public function removePair( $from ) {
		unset( $this->data[$from] );
		$this->fss = false;
	}

	/**
	 * Remove every pair whose key appears as a key of $data.
	 *
	 * @param array $data Array keyed by the "from" strings to remove; the
	 *   values are ignored
	 */
	public function removeArray( $data ) {
		foreach ( $data as $from => $to ) {
			$this->removePair( $from );
		}
		$this->fss = false;
	}

	/**
	 * Apply the replacement pairs to a string.
	 *
	 * @param string $subject String to transform
	 * @return string The transformed string
	 */
	public function replace( $subject ) {
		if ( function_exists( 'fss_prep_replace' ) ) {
			wfProfileIn( __METHOD__ . '-fss' );
			// Prepare the search structure lazily and reuse it until the data changes
			if ( $this->fss === false ) {
				$this->fss = fss_prep_replace( $this->data );
			}
			$result = fss_exec_replace( $this->fss, $subject );
			wfProfileOut( __METHOD__ . '-fss' );
		} else {
			wfProfileIn( __METHOD__ . '-strtr' );
			$result = strtr( $subject, $this->data );
			wfProfileOut( __METHOD__ . '-strtr' );
		}

		return $result;
	}
}

/**
 * An iterator that yields the same pieces as
 *
 *   foreach ( explode( $delim, $subject ) as $piece ) { ... }
 *
 * but extracts each piece only when it is reached instead of building the
 * whole array up front. The key of each piece is its byte offset in the
 * subject, not a sequential index.
 */
class ExplodeIterator implements Iterator {
	// The subject string
	private $subject, $subjectLength;

	// The delimiter
	private $delim, $delimLength;

	// The position of the start of the line
	private $curPos;

	// The position after the end of the next delimiter
	private $endPos;

	// The current token
	private $current;

	/**
	 * Construct the iterator and position it on the first piece.
	 *
	 * @param string $delim Delimiter to split on
	 * @param string $subject String to split
	 */
	public function __construct( $delim, $subject ) {
		$this->subject = $subject;
		$this->delim = $delim;

		// Micro-optimisation (theoretical)
		$this->subjectLength = strlen( $subject );
		$this->delimLength = strlen( $delim );

		$this->rewind();
	}

	/**
	 * Move back to the first piece.
	 */
	#[\ReturnTypeWillChange]
	public function rewind() {
		$this->curPos = 0;
		$this->endPos = strpos( $this->subject, $this->delim );
		$this->refreshCurrent();
	}

	/**
	 * Recompute the current piece from the current start and end positions.
	 */
	public function refreshCurrent() {
		if ( $this->curPos === false ) {
			// Iteration has finished
			$this->current = false;
		} elseif ( $this->curPos >= $this->subjectLength ) {
			// Empty subject, or the subject ends with a delimiter: explode()
			// produces a final empty piece in these cases, so do the same
			$this->current = '';
		} elseif ( $this->endPos === false ) {
			// Last piece: everything up to the end of the subject
			$this->current = substr( $this->subject, $this->curPos );
		} else {
			$this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
		}
	}

	/**
	 * @return string|bool The current piece, or false once iteration has finished
	 */
	#[\ReturnTypeWillChange]
	public function current() {
		return $this->current;
	}

	/**
	 * @return int|bool Byte offset of the current piece in the subject, or
	 *   false once iteration has finished
	 */
	#[\ReturnTypeWillChange]
	public function key() {
		return $this->curPos;
	}

	/**
	 * Advance to the next piece.
	 *
	 * @return string|bool The new current piece, or false if there is none
	 */
	#[\ReturnTypeWillChange]
	public function next() {
		if ( $this->endPos === false ) {
			// No delimiter followed the current piece, so it was the last one
			$this->curPos = false;
		} else {
			$this->curPos = $this->endPos + $this->delimLength;
			if ( $this->curPos >= $this->subjectLength ) {
				$this->endPos = false;
			} else {
				$this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
			}
		}
		$this->refreshCurrent();

		return $this->current;
	}

	/**
	 * @return bool Whether the iterator is positioned on a piece
	 */
	#[\ReturnTypeWillChange]
	public function valid() {
		return $this->curPos !== false;
	}
}