MediaWiki  REL1_21
StringUtils.php
Go to the documentation of this file.
00001 <?php
/**
 * A collection of static helpers for string processing: UTF-8 validation,
 * delimiter-based replacement, markup-aware splitting and regex escaping.
 */
class StringUtils {

	/**
	 * Test whether a string is valid UTF-8.
	 *
	 * Input without any byte in the range 0x80-0xff is pure ASCII and is
	 * accepted immediately. Otherwise mb_check_encoding() decides, when it is
	 * available and not disabled. As a last resort the string is validated by
	 * PCRE itself via the "u" modifier, which rejects malformed and overlong
	 * sequences, surrogates, code points above U+10FFFF and the obsolete
	 * 5- and 6-byte forms.
	 *
	 * @param string $value String to check
	 * @param bool $disableMbstring Skip mb_check_encoding() even when it
	 *   exists, forcing the fallback implementation
	 * @return bool Whether $value is valid UTF-8
	 */
	public static function isUtf8( $value, $disableMbstring = false ) {
		if ( preg_match( '/[\x80-\xff]/', $value ) === 0 ) {
			# No high bit set: this is pure ASCII, which is de facto valid UTF-8
			return true;
		}

		if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
			return mb_check_encoding( $value, 'UTF-8' );
		}

		# With the "u" modifier PCRE validates the subject before matching; an
		# invalid subject makes preg_match() fail instead of returning 1. The
		# empty pattern matches any valid subject, and no group is repeated
		# over the whole input.
		return preg_match( '//u', $value ) === 1;
	}

	/**
	 * Replace delimited sections of a string with a fixed replacement.
	 *
	 * The subject is split on $startDelim. Each piece after the first is
	 * searched for the first $endDelim: when one is found, everything up to
	 * and including it is replaced by $replace; when none is found, the piece
	 * is written back unchanged together with its start delimiter.
	 * $replace is inserted literally (no "$1"-style expansion).
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param string $replace Replacement text
	 * @param string $subject String to search
	 * @return string The processed string
	 */
	public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
		$segments = explode( $startDelim, $subject );
		$output = array_shift( $segments );
		foreach ( $segments as $s ) {
			$endDelimPos = strpos( $s, $endDelim );
			if ( $endDelimPos === false ) {
				# No end delimiter in this piece: restore it verbatim
				$output .= $startDelim . $s;
			} else {
				$output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
			}
		}
		return $output;
	}

	/**
	 * Replace delimited sections of a string using a callback.
	 *
	 * The subject is scanned once for start and end delimiters. For each
	 * start/end pair the callback is called with a two-element array:
	 *   - [0] the whole matched section, delimiters included
	 *   - [1] the content between the delimiters
	 * and its return value is written to the output in place of the section.
	 * When several start delimiters precede an end delimiter, the outermost
	 * (first) start is the one paired with it. An end delimiter without a
	 * preceding start, and an unterminated start, are copied through as is.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param callable $callback Callback producing the replacement text
	 * @param string $subject String to search
	 * @param string $flags Regular expression modifiers appended to the
	 *   delimiter pattern; "i" also makes the delimiter comparison
	 *   case-insensitive
	 * @throws MWException If a match is found where neither delimiter group
	 *   captured any text
	 * @return string The processed string
	 */
	public static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags = '' ) {
		$inputPos = 0;
		$outputPos = 0;
		# Only read while $foundStart is true, at which point it has been set
		$contentPos = 0;
		$output = '';
		$foundStart = false;
		$encStart = preg_quote( $startDelim, '!' );
		$encEnd = preg_quote( $endDelim, '!' );
		$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
		$endLength = strlen( $endDelim );
		$m = array();

		while ( $inputPos < strlen( $subject ) &&
			preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) )
		{
			$tokenOffset = $m[0][1];
			if ( $m[1][0] != '' ) {
				if ( $foundStart &&
					$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 )
				{
					# An end match is present at the same location
					$tokenType = 'end';
					$tokenLength = $endLength;
				} else {
					$tokenType = 'start';
					$tokenLength = strlen( $m[0][0] );
				}
			} elseif ( $m[2][0] != '' ) {
				$tokenType = 'end';
				$tokenLength = strlen( $m[0][0] );
			} else {
				throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
			}

			if ( $tokenType == 'start' ) {
				# Only move the start position if we haven't already found a start
				# This means that START START END matches outer pair
				if ( !$foundStart ) {
					# Found start
					$inputPos = $tokenOffset + $tokenLength;
					# Write out the non-matching section
					$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
					$outputPos = $tokenOffset;
					$contentPos = $inputPos;
					$foundStart = true;
				} else {
					# Move the input position past the *first character* of START,
					# to protect against missing END when it overlaps with START
					$inputPos = $tokenOffset + 1;
				}
			} elseif ( $tokenType == 'end' ) {
				if ( $foundStart ) {
					# Found match
					$output .= call_user_func( $callback, array(
						substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
						substr( $subject, $contentPos, $tokenOffset - $contentPos )
					));
					$foundStart = false;
				} else {
					# Non-matching end, write it out. Outside a start/end pair
					# $inputPos and $outputPos always hold the same value.
					$output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
				}
				$inputPos = $outputPos = $tokenOffset + $tokenLength;
			} else {
				throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
			}
		}
		# Copy whatever follows the last processed token
		if ( $outputPos < strlen( $subject ) ) {
			$output .= substr( $subject, $outputPos );
		}
		return $output;
	}

	/**
	 * Replace delimited sections of a string using a replacement template.
	 *
	 * Thin wrapper around delimiterReplaceCallback() that uses a
	 * RegexlikeReplacer built from $replace as the callback.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param string $replace Replacement template handed to RegexlikeReplacer
	 * @param string $subject String to search
	 * @param string $flags Regular expression modifiers, see
	 *   delimiterReplaceCallback()
	 * @return string The processed string
	 */
	public static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
		$replacer = new RegexlikeReplacer( $replace );
		return self::delimiterReplaceCallback( $startDelim, $endDelim,
			$replacer->cb(), $subject, $flags );
	}

	/**
	 * Split a string on a separator, ignoring separators that appear between
	 * "<" and ">".
	 *
	 * NUL bytes are used internally as a placeholder, so any NUL byte already
	 * present in $text is removed first.
	 *
	 * @param string $separator Separator to split on
	 * @param string $text Text to split
	 * @return array The pieces of $text
	 */
	public static function explodeMarkup( $separator, $text ) {
		$placeholder = "\x00";

		// Remove placeholder instances
		$text = str_replace( $placeholder, '', $text );

		// Replace instances of the separator inside HTML-like tags with the placeholder
		$replacer = new DoubleReplacer( $separator, $placeholder );
		$cleaned = self::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );

		// Explode, then put the replaced separators back in
		$items = explode( $separator, $cleaned );
		foreach ( $items as $i => $str ) {
			$items[$i] = str_replace( $placeholder, $separator, $str );
		}

		return $items;
	}

	/**
	 * Escape a string so that backslashes and dollar signs in it are taken
	 * literally when it is used as a regular expression replacement.
	 *
	 * @param string $string String to escape
	 * @return string Escaped string
	 */
	public static function escapeRegexReplacement( $string ) {
		// Backslashes first, so the ones added for "$" are not doubled
		$string = str_replace( '\\', '\\\\', $string );
		$string = str_replace( '$', '\\$', $string );
		return $string;
	}

	/**
	 * Split a string on a separator and return an iterator over the pieces.
	 *
	 * Subjects containing more than 1000 separators are walked lazily with an
	 * ExplodeIterator; smaller ones are split with explode() and wrapped in
	 * an ArrayIterator.
	 *
	 * @param string $separator Separator to split on
	 * @param string $subject String to split
	 * @return ArrayIterator|ExplodeIterator
	 */
	public static function explode( $separator, $subject ) {
		if ( substr_count( $subject, $separator ) > 1000 ) {
			return new ExplodeIterator( $separator, $subject );
		} else {
			return new ArrayIterator( explode( $separator, $subject ) );
		}
	}
}
00271 
/**
 * Base class for "replacers": objects that turn a match array into a
 * replacement string through a replace() method supplied by subclasses.
 */
class Replacer {

	/**
	 * Get a callable that invokes this object's replace() method.
	 *
	 * @return array Callable of the form array( object, 'replace' )
	 */
	public function cb() {
		// Objects are held by handle, so no reference to $this is needed.
		return array( $this, 'replace' );
	}
}
00285 
/**
 * Replacer that fills a template with match data: every "$<key>" token in
 * the template is substituted by the match element stored under <key>.
 */
class RegexlikeReplacer extends Replacer {
	/** Replacement template */
	public $r;

	/**
	 * @param string $r Replacement template containing "$0", "$1", ... tokens
	 */
	public function __construct( $r ) {
		$this->r = $r;
	}

	/**
	 * Expand the template against a match array.
	 *
	 * @param array $matches Match array, keyed by group
	 * @return string The template with its "$<key>" tokens substituted
	 */
	public function replace( $matches ) {
		$substitutions = array();
		foreach ( $matches as $group => $text ) {
			$substitutions['$' . $group] = $text;
		}
		return strtr( $this->r, $substitutions );
	}

}
00312 
/**
 * Replacer that runs a plain string replacement (from => to) on one element
 * of the match array.
 */
class DoubleReplacer extends Replacer {
	// Declared explicitly so the constructor does not create dynamic properties
	var $from, $to, $index;

	/**
	 * @param string $from Text to search for
	 * @param string $to Text to substitute
	 * @param int $index Which element of the match array to operate on
	 */
	function __construct( $from, $to, $index = 0 ) {
		$this->from = $from;
		$this->to = $to;
		$this->index = $index;
	}

	/**
	 * @param array $matches Match array
	 * @return string The selected match element with $from replaced by $to
	 */
	function replace( $matches ) {
		return str_replace( $this->from, $this->to, $matches[$this->index] );
	}
}
00337 
/**
 * Replacer that uses one element of the match array as a key into a lookup
 * table and returns the value stored there.
 */
class HashtableReplacer extends Replacer {
	/** Lookup table mapping matched text to its replacement */
	public $table;
	/** Index of the match element used as the lookup key */
	public $index;

	/**
	 * @param array $table Lookup table
	 * @param int $index Which element of the match array to look up
	 */
	public function __construct( $table, $index = 0 ) {
		$this->index = $index;
		$this->table = $table;
	}

	/**
	 * @param array $matches Match array
	 * @return mixed The table entry for the selected match element
	 */
	public function replace( $matches ) {
		$key = $matches[$this->index];
		return $this->table[$key];
	}
}
00361 
/**
 * Holder for a set of from => to replacement pairs that can be applied to a
 * string in one pass. Uses the fss_* functions when they exist and strtr()
 * otherwise.
 */
class ReplacementArray {
	/** Replacement pairs, from => to (mostly private) */
	public $data = false;
	/** Prepared fss_prep_replace() result, or false when stale (mostly private) */
	public $fss = false;

	/**
	 * @param array $data Initial replacement pairs, from => to
	 */
	public function __construct( $data = array() ) {
		$this->data = $data;
	}

	/**
	 * Serialize only the pairs; the prepared fss state is rebuilt on demand.
	 *
	 * @return array Names of the properties to serialize
	 */
	public function __sleep() {
		return array( 'data' );
	}

	/**
	 * Mark the prepared fss state as stale after unserialization.
	 */
	public function __wakeup() {
		$this->fss = false;
	}

	/**
	 * Replace the whole set of pairs.
	 *
	 * @param array $data New replacement pairs, from => to
	 */
	public function setArray( $data ) {
		$this->fss = false;
		$this->data = $data;
	}

	/**
	 * @return array The current replacement pairs
	 */
	public function getArray() {
		return $this->data;
	}

	/**
	 * Add or overwrite a single pair.
	 *
	 * @param string $from Text to search for
	 * @param string $to Text to substitute
	 */
	public function setPair( $from, $to ) {
		$this->fss = false;
		$this->data[$from] = $to;
	}

	/**
	 * Merge an array of pairs into the current set using array_merge().
	 *
	 * @param array $data Pairs to merge in
	 */
	public function mergeArray( $data ) {
		$combined = array_merge( $this->data, $data );
		$this->data = $combined;
		$this->fss = false;
	}

	/**
	 * Merge the pairs of another ReplacementArray into this one.
	 *
	 * @param ReplacementArray $other Object whose pairs are merged in
	 */
	public function merge( $other ) {
		$combined = array_merge( $this->data, $other->data );
		$this->data = $combined;
		$this->fss = false;
	}

	/**
	 * Remove a single pair.
	 *
	 * @param string $from Key of the pair to remove
	 */
	public function removePair( $from ) {
		$this->fss = false;
		unset( $this->data[$from] );
	}

	/**
	 * Remove every pair whose key appears as a key of $data.
	 *
	 * @param array $data Array keyed by the "from" strings to remove
	 */
	public function removeArray( $data ) {
		foreach ( $data as $key => $unused ) {
			$this->removePair( $key );
		}
		$this->fss = false;
	}

	/**
	 * Apply all replacement pairs to a string.
	 *
	 * @param string $subject String to process
	 * @return string Result of the replacement
	 */
	public function replace( $subject ) {
		$haveFss = function_exists( 'fss_prep_replace' );
		$section = __METHOD__ . ( $haveFss ? '-fss' : '-strtr' );

		wfProfileIn( $section );
		if ( $haveFss ) {
			// Prepare the fss state lazily and reuse it until the pairs change
			if ( $this->fss === false ) {
				$this->fss = fss_prep_replace( $this->data );
			}
			$result = fss_exec_replace( $this->fss, $subject );
		} else {
			$result = strtr( $subject, $this->data );
		}
		wfProfileOut( $section );

		return $result;
	}
}
00469 
/**
 * Iterator that walks the pieces of a string split on a delimiter, finding
 * each piece on demand with strpos() rather than splitting everything up
 * front. The key of each element is the byte offset at which it starts.
 */
class ExplodeIterator implements Iterator {
	/** The subject string */
	public $subject;
	/** Byte length of the subject */
	public $subjectLength;

	/** The delimiter */
	public $delim;
	/** Byte length of the delimiter */
	public $delimLength;

	/** Offset where the current piece starts, or false once exhausted */
	public $curPos;

	/** Offset of the next delimiter, or false when there is none */
	public $endPos;

	/** The current piece */
	public $current;

	/**
	 * @param string $delim Delimiter to split on
	 * @param string $s Subject string
	 */
	public function __construct( $delim, $s ) {
		$this->delim = $delim;
		$this->delimLength = strlen( $delim );

		$this->subject = $s;
		$this->subjectLength = strlen( $s );

		$this->rewind();
	}

	/**
	 * Go back to the first piece.
	 */
	public function rewind() {
		$this->curPos = 0;
		$this->endPos = strpos( $this->subject, $this->delim );
		$this->refreshCurrent();
	}

	/**
	 * Recompute the current piece from the current and end offsets.
	 */
	public function refreshCurrent() {
		if ( $this->curPos === false ) {
			// Iteration is over
			$this->current = false;
			return;
		}

		if ( $this->curPos >= $this->subjectLength ) {
			// Start offset is at or past the end of the subject: empty piece
			$this->current = '';
			return;
		}

		$this->current = ( $this->endPos === false )
			? substr( $this->subject, $this->curPos )
			: substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
	}

	/**
	 * @return string|bool The current piece, or false once exhausted
	 */
	public function current() {
		return $this->current;
	}

	/**
	 * @return int|bool Start offset of the current piece, or false once exhausted
	 */
	public function key() {
		return $this->curPos;
	}

	/**
	 * Advance to the next piece.
	 *
	 * @return string|bool The new current piece, or false once exhausted
	 */
	public function next() {
		if ( $this->endPos !== false ) {
			$nextStart = $this->endPos + $this->delimLength;
			$this->curPos = $nextStart;
			$this->endPos = ( $nextStart >= $this->subjectLength )
				? false
				: strpos( $this->subject, $this->delim, $nextStart );
		} else {
			// No delimiter left: the piece just returned was the last one
			$this->curPos = false;
		}

		$this->refreshCurrent();
		return $this->current;
	}

	/**
	 * @return bool Whether there is a current piece
	 */
	public function valid() {
		return $this->curPos !== false;
	}
}