php/html/StringUtils_8php_source.html

00001 <?php
/**
 * A collection of static methods to play with strings.
 */
class StringUtils {

    /**
     * Test whether a string is valid UTF-8.
     *
     * The value is cast to string first. When mb_check_encoding() exists and
     * $disableMbstring is false, mbstring does the check, backed by a regex guard on
     * builds that wrongly accept code points above U+10FFFF. Otherwise the string is
     * scanned with a list of short regexes, each of which matches one kind of invalid
     * byte sequence.
     *
     * @param mixed $value Value to test; cast to string
     * @param bool $disableMbstring Force the pure-regex path even if mbstring is loaded
     * @return bool True when the string is valid UTF-8
     */
    static function isUtf8( $value, $disableMbstring = false ) {
        $value = (string)$value;

        if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
            // Probe once per request: before PHP 5.4, mbstring accepted values above
            // U+10FFFF. A build that rejects the probe needs no extra check.
            static $rejectsOutOfRange = null;
            if ( $rejectsOutOfRange === null ) {
                $rejectsOutOfRange = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
            }

            if ( !mb_check_encoding( $value, 'UTF-8' ) ) {
                return false;
            }
            if ( $rejectsOutOfRange ) {
                return true;
            }
            // Old mbstring: reject lead bytes that can only encode values > U+10FFFF
            return preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0;
        }

        // No byte with the high bit set means pure ASCII, which is always valid
        if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
            return true;
        }

        // PCRE implements repetition using recursion, so matching *valid* sequences
        // (which may span the whole input) risks a stack overflow on large strings.
        // Instead we look for *invalid* sequences, which are at most 5 bytes long, and
        // use several short patterns rather than one long one for performance.
        static $invalidPatterns = null;
        if ( $invalidPatterns === null ) {
            $cont = "[\x80-\xbf]";
            $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
            $invalidPatterns = array(
                // Continuation byte at the start
                "/^$cont/",

                // ASCII byte followed by a continuation byte
                "/[\\x00-\x7f]$cont/S",

                // Illegal byte
                "/[\xc0\xc1\xf5-\xff]/S",

                // Invalid 2-byte sequence, or valid one then an extra continuation byte
                "/[\xc2-\xdf](?!$cont$after)/S",

                // Invalid 3-byte sequence, or valid one then an extra continuation byte
                "/\xe0(?![\xa0-\xbf]$cont$after)/",
                "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
                "/\xed(?![\x80-\x9f]$cont$after)/",

                // Invalid 4-byte sequence, or valid one then an extra continuation byte
                "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
                "/[\xf1-\xf3](?!$cont{3}$after)/S",
                "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
            );
        }

        foreach ( $invalidPatterns as $pattern ) {
            $hit = preg_match( $pattern, $value );
            if ( $hit !== 0 ) {
                // Either an invalid sequence was found or PCRE reported an error
                return false;
            }
        }
        return true;
    }

    /**
     * Replace delimited sections of a string using explode() on the start delimiter.
     *
     * The subject is split on $startDelim. Every piece following a start delimiter is
     * searched for the first $endDelim: if one is found, everything from the start
     * delimiter through that end delimiter becomes $replace; if not, the start delimiter
     * and the piece are emitted unchanged. The delimiters are literal strings and
     * $replace is inserted verbatim.
     *
     * @param string $startDelim Literal start delimiter
     * @param string $endDelim Literal end delimiter
     * @param string $replace Replacement text
     * @param string $subject String to search
     * @return string
     */
    static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
        $pieces = explode( $startDelim, $subject );
        // Text before the first start delimiter is always copied through untouched
        $result = array_shift( $pieces );
        $endLength = strlen( $endDelim );

        foreach ( $pieces as $piece ) {
            $endAt = strpos( $piece, $endDelim );
            $result .= ( $endAt === false )
                ? $startDelim . $piece
                : $replace . substr( $piece, $endAt + $endLength );
        }
        return $result;
    }

    /**
     * Replace delimited sections of a string with the result of a callback.
     *
     * The subject is scanned for the next occurrence of either delimiter. Once a start
     * delimiter has been seen, further start delimiters are skipped until an end
     * delimiter is found, so START START END matches the outer pair. An end delimiter
     * with no open start is copied to the output as-is.
     *
     * @param string $startDelim Literal start delimiter
     * @param string $endDelim Literal end delimiter
     * @param callable $callback Called with array( 0 => text including both delimiters,
     *   1 => text between the delimiters ); its return value replaces the whole match
     * @param string $subject String to search
     * @param string $flags Regex modifier characters appended to the search pattern;
     *   'i' also makes the end-at-same-position comparison case-insensitive
     * @throws MWException If the pattern matches without either group being non-empty
     * @return string
     */
    static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags = '' ) {
        $searchFrom = 0;  // where the next regex search begins
        $copiedUpTo = 0;  // everything before this offset is already accounted for
        $result = '';
        $inside = false;  // true while a start delimiter is waiting for its end
        $contentStart = 0;

        $compare = ( strpos( $flags, 'i' ) === false ) ? 'strcmp' : 'strcasecmp';
        $endLength = strlen( $endDelim );
        $subjectLength = strlen( $subject );
        $pattern = '!(' . preg_quote( $startDelim, '!' ) . ')|(' .
            preg_quote( $endDelim, '!' ) . ")!S$flags";
        $m = array();

        while ( $searchFrom < $subjectLength &&
            preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $searchFrom )
        ) {
            $tokenOffset = $m[0][1];
            $matchedLength = strlen( $m[0][0] );

            // Classify the token that was just found
            if ( $m[1][0] != '' ) {
                // The start alternative matched, but if we are waiting for an end and
                // an end delimiter is also present at this location, the end wins.
                $endHere = $inside &&
                    $compare( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0;
                if ( $endHere ) {
                    $isStart = false;
                    $tokenLength = $endLength;
                } else {
                    $isStart = true;
                    $tokenLength = $matchedLength;
                }
            } elseif ( $m[2][0] != '' ) {
                $isStart = false;
                $tokenLength = $matchedLength;
            } else {
                throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
            }

            if ( $isStart ) {
                if ( $inside ) {
                    // Already have a start: step past only the *first character* of
                    // this one, so an END that overlaps with START is not missed.
                    $searchFrom = $tokenOffset + 1;
                } else {
                    // First start: flush the non-matching text before it and remember
                    // where the delimited content begins.
                    $searchFrom = $tokenOffset + $tokenLength;
                    $result .= substr( $subject, $copiedUpTo, $tokenOffset - $copiedUpTo );
                    $copiedUpTo = $tokenOffset;
                    $contentStart = $searchFrom;
                    $inside = true;
                }
                continue;
            }

            // End token
            $tokenEnd = $tokenOffset + $tokenLength;
            if ( $inside ) {
                // Complete match: hand the whole span and its inner content over
                $result .= call_user_func( $callback, array(
                    substr( $subject, $copiedUpTo, $tokenEnd - $copiedUpTo ),
                    substr( $subject, $contentStart, $tokenOffset - $contentStart )
                ) );
                $inside = false;
            } else {
                // Non-matching end, write it out. The search position and the copy
                // position coincide whenever no start is open.
                $result .= substr( $subject, $searchFrom, $tokenEnd - $copiedUpTo );
            }
            $searchFrom = $copiedUpTo = $tokenEnd;
        }

        // Whatever is left (including an unterminated start) is copied verbatim
        if ( $copiedUpTo < $subjectLength ) {
            $result .= substr( $subject, $copiedUpTo );
        }
        return $result;
    }

    /**
     * Replace delimited sections of a string with a replacement template.
     *
     * Thin wrapper around delimiterReplaceCallback() using a RegexlikeReplacer, so in
     * $replace the token $0 stands for the match including delimiters and $1 for the
     * text between them.
     *
     * @param string $startDelim Literal start delimiter
     * @param string $endDelim Literal end delimiter
     * @param string $replace Replacement template
     * @param string $subject String to search
     * @param string $flags Regex modifier characters, see delimiterReplaceCallback()
     * @return string
     */
    static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
        $template = new RegexlikeReplacer( $replace );
        $callback = $template->cb();
        return self::delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags );
    }

    /**
     * Split a string on a separator, ignoring separators inside <...> sections.
     *
     * NUL bytes are stripped from the input because NUL is used as the temporary
     * stand-in for protected separators.
     *
     * @param string $separator
     * @param string $text
     * @return array Pieces of $text
     */
    static function explodeMarkup( $separator, $text ) {
        $placeholder = "\x00";

        // Make sure the placeholder cannot already occur in the text
        $text = str_replace( $placeholder, '', $text );

        // Hide separators that sit inside HTML-like tags
        $hider = new DoubleReplacer( $separator, $placeholder );
        $masked = self::delimiterReplaceCallback( '<', '>', $hider->cb(), $text );

        // Split on the remaining separators, then restore the hidden ones in each piece
        $pieces = explode( $separator, $masked );
        return str_replace( $placeholder, $separator, $pieces );
    }

    /**
     * Escape a string so it is taken literally as a preg_replace() replacement.
     *
     * Backslashes are doubled first, then every dollar sign gets a backslash prefix.
     *
     * @param string $string
     * @return string
     */
    static function escapeRegexReplacement( $string ) {
        // Array form is applied in order: backslashes first, then dollar signs
        return str_replace( array( '\\', '$' ), array( '\\\\', '\\$' ), $string );
    }

    /**
     * Split a string on a separator and return the pieces as an iterator.
     *
     * Subjects containing more than 1000 separators are handed to ExplodeIterator;
     * anything smaller is exploded up front and wrapped in an ArrayIterator.
     *
     * @param string $separator
     * @param string $subject
     * @return ArrayIterator|ExplodeIterator
     */
    static function explode( $separator, $subject ) {
        $manyPieces = substr_count( $subject, $separator ) > 1000;
        if ( $manyPieces ) {
            return new ExplodeIterator( $separator, $subject );
        }
        return new ArrayIterator( explode( $separator, $subject ) );
    }
}
00312
/**
 * Base class for "replacers": objects that bundle replacement state with a replace()
 * method and can hand themselves out as a callback.
 *
 * This base class does not define replace(); subclasses are expected to provide it.
 */
class Replacer {

    /**
     * Get a callback that invokes replace() on this object.
     *
     * Objects are handles, so the callback array holds $this directly; no reference
     * is needed for the callee to operate on this very instance.
     *
     * @return array Callable of the form array( $this, 'replace' )
     */
    function cb() {
        return array( $this, 'replace' );
    }
}
00326
/**
 * Replacer that fills a template string using $0, $1, ... tokens, in the manner of a
 * preg_replace() replacement string.
 */
class RegexlikeReplacer extends Replacer {
    // The replacement template
    var $r;

    /**
     * @param string $r Template in which "$<key>" tokens will be substituted
     */
    function __construct( $r ) {
        $this->r = $r;
    }

    /**
     * Expand the template for one match.
     *
     * Each key of $matches becomes the token "$<key>", mapped to the corresponding
     * value; all tokens are substituted in a single strtr() pass.
     *
     * @param array $matches Match array, e.g. array( 0 => whole match, 1 => group 1 )
     * @return string
     */
    function replace( $matches ) {
        $substitutions = array();
        foreach ( $matches as $groupKey => $groupText ) {
            $substitutions['$' . $groupKey] = $groupText;
        }
        return strtr( $this->r, $substitutions );
    }

}
00353
/**
 * Replacer that performs a plain string replacement inside one element of the match
 * array.
 */
class DoubleReplacer extends Replacer {
    // Declared explicitly (like HashtableReplacer does) so the constructor does not
    // create dynamic properties; kept public via "var" for backward compatibility.
    var $from, $to, $index;

    /**
     * @param string $from Text to search for
     * @param string $to Text to substitute
     * @param int $index Which element of the match array to operate on
     */
    function __construct( $from, $to, $index = 0 ) {
        $this->from = $from;
        $this->to = $to;
        $this->index = $index;
    }

    /**
     * @param array $matches Match array
     * @return string The selected match element with every $from replaced by $to
     */
    function replace( $matches ) {
        return str_replace( $this->from, $this->to, $matches[$this->index] );
    }
}
00378
/**
 * Replacer that looks up one element of the match array in a fixed table.
 */
class HashtableReplacer extends Replacer {
    // Lookup table, and the match-array element used as the lookup key
    var $table, $index;

    /**
     * @param array $table Map from matched text to replacement
     * @param int $index Which element of the match array is the key
     */
    function __construct( $table, $index = 0 ) {
        $this->index = $index;
        $this->table = $table;
    }

    /**
     * @param array $matches Match array
     * @return mixed The table entry stored under the selected match element
     */
    function replace( $matches ) {
        $lookupKey = $matches[$this->index];
        return $this->table[$lookupKey];
    }
}
00402
/**
 * A set of from => to string replacement pairs, applied in one pass by replace().
 *
 * When the fss_prep_replace() function exists, a prepared search object is cached in
 * $fss and reused until the pairs change; otherwise strtr() is used.
 */
class ReplacementArray {
    /*mostly private*/ var $data = false;
    /*mostly private*/ var $fss = false;

    /**
     * @param array $data Replacement pairs, keyed by source string
     */
    function __construct( $data = array() ) {
        $this->data = $data;
    }

    /**
     * Serialize only the pairs; the prepared search object is rebuilt on demand.
     *
     * @return array
     */
    function __sleep() {
        return array( 'data' );
    }

    function __wakeup() {
        $this->fss = false;
    }

    /**
     * Replace the whole set of pairs.
     *
     * @param array $data
     */
    function setArray( $data ) {
        $this->data = $data;
        $this->fss = false;
    }

    /**
     * @return array The current replacement pairs
     */
    function getArray() {
        return $this->data;
    }

    /**
     * Add or overwrite a single pair.
     *
     * @param string $from
     * @param string $to
     */
    function setPair( $from, $to ) {
        $this->data[$from] = $to;
        $this->fss = false;
    }

    /**
     * Add or overwrite several pairs at once.
     *
     * array_replace() is used rather than array_merge() because PHP stores numeric
     * source strings such as '1' under integer keys, and array_merge() would renumber
     * those keys from zero, silently changing which text gets replaced.
     *
     * @param array $data Pairs to merge in; they win over existing pairs
     */
    function mergeArray( $data ) {
        $this->data = array_replace( $this->data, $data );
        $this->fss = false;
    }

    /**
     * Merge in the pairs of another ReplacementArray; its pairs win on conflict.
     * Keys are preserved for the same reason as in mergeArray().
     *
     * @param ReplacementArray $other
     */
    function merge( $other ) {
        $this->data = array_replace( $this->data, $other->data );
        $this->fss = false;
    }

    /**
     * Remove a single pair, if present.
     *
     * @param string $from
     */
    function removePair( $from ) {
        unset( $this->data[$from] );
        $this->fss = false;
    }

    /**
     * Remove every pair whose source string is a key of $data; the values are ignored.
     *
     * @param array $data
     */
    function removeArray( $data ) {
        foreach ( array_keys( $data ) as $from ) {
            $this->removePair( $from );
        }
        // Also covers the case where $data was empty
        $this->fss = false;
    }

    /**
     * Apply all replacement pairs to a string.
     *
     * @param string $subject
     * @return string
     */
    function replace( $subject ) {
        if ( function_exists( 'fss_prep_replace' ) ) {
            wfProfileIn( __METHOD__ . '-fss' );
            // Prepare lazily; every mutator resets $fss to false
            if ( $this->fss === false ) {
                $this->fss = fss_prep_replace( $this->data );
            }
            $result = fss_exec_replace( $this->fss, $subject );
            wfProfileOut( __METHOD__ . '-fss' );
        } else {
            wfProfileIn( __METHOD__ . '-strtr' );
            $result = strtr( $subject, $this->data );
            wfProfileOut( __METHOD__ . '-strtr' );
        }
        return $result;
    }
}
00511
/**
 * Iterator that yields the pieces of a delimited string one at a time, finding each
 * delimiter with strpos() as iteration proceeds instead of building the full array
 * the way explode() does.
 *
 * Keys are the byte offset at which each piece starts in the subject.
 */
class ExplodeIterator implements Iterator {
    // The subject string
    var $subject, $subjectLength;

    // The delimiter
    var $delim, $delimLength;

    // Offset where the current piece starts, or false once iteration is finished
    var $curPos;

    // Offset of the delimiter that ends the current piece, or false if there is none
    var $endPos;

    // The current token
    var $current;

    /**
     * Construct the iterator and position it on the first piece.
     *
     * @param string $delim
     * @param string $subject
     */
    function __construct( $delim, $subject ) {
        $this->subject = $subject;
        $this->delim = $delim;

        // Cache both lengths so they are not recomputed on every step
        $this->subjectLength = strlen( $subject );
        $this->delimLength = strlen( $delim );

        $this->rewind();
    }

    function rewind() {
        $this->curPos = 0;
        $this->endPos = strpos( $this->subject, $this->delim );
        $this->refreshCurrent();
    }

    /**
     * Recompute $current from $curPos and $endPos.
     */
    function refreshCurrent() {
        if ( $this->curPos === false ) {
            // Past the end
            $piece = false;
        } elseif ( $this->curPos >= $this->subjectLength ) {
            // Subject is empty or ended with a delimiter: one last empty piece
            $piece = '';
        } elseif ( $this->endPos === false ) {
            // No further delimiter: the rest of the subject
            $piece = substr( $this->subject, $this->curPos );
        } else {
            $pieceLength = $this->endPos - $this->curPos;
            $piece = substr( $this->subject, $this->curPos, $pieceLength );
        }
        $this->current = $piece;
    }

    function current() {
        return $this->current;
    }

    /**
     * @return int|bool Start offset of the current piece, or false when finished
     */
    function key() {
        return $this->curPos;
    }

    /**
     * Advance to the next piece.
     *
     * @return string|bool The new current piece, or false when finished
     */
    function next() {
        if ( $this->endPos === false ) {
            // The piece just consumed was the last one
            $this->curPos = false;
        } else {
            $nextStart = $this->endPos + $this->delimLength;
            $this->curPos = $nextStart;
            $this->endPos = ( $nextStart >= $this->subjectLength )
                ? false
                : strpos( $this->subject, $this->delim, $nextStart );
        }
        $this->refreshCurrent();
        return $this->current;
    }

    /**
     * @return bool
     */
    function valid() {
        return $this->curPos !== false;
    }
}