php/html/StringUtils_8php_source.html

00001 <?php
/**
 * A collection of static helper methods for working with strings.
 */
class StringUtils {
    /**
     * Test whether a string is valid UTF-8.
     *
     * When the mbstring extension is available (and not disabled through
     * $disableMbstring) mb_check_encoding() does the work; otherwise a set of
     * short regexes looks for invalid byte sequences.
     *
     * @param string $value String to check (cast to string first)
     * @param bool $disableMbstring Force the pure-regex implementation
     * @return bool True if $value is valid UTF-8
     */
    static function isUtf8( $value, $disableMbstring = false ) {
        $value = (string)$value;

        // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
        // U+10FFFF are incorrectly allowed, so we have to check for them separately.
        if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
            // Probed once per request: true when mbstring itself rejects > U+10FFFF
            static $newPHP;
            if ( $newPHP === null ) {
                $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
            }

            return mb_check_encoding( $value, 'UTF-8' ) &&
                ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
        }

        if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
            // String contains only ASCII characters, has to be valid
            return true;
        }

        // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
        // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
        // sequences, which can be as long as the input string is. Multiple short regexes are
        // used rather than a single long regex for performance.
        static $regexes;
        if ( $regexes === null ) {
            $cont = "[\x80-\xbf]";
            $after = "(?!{$cont})"; // "(?:[^\x80-\xbf]|$)" would work here
            $regexes = array(
                // Continuation byte at the start
                "/^{$cont}/",

                // ASCII byte followed by a continuation byte
                "/[\\x00-\x7f]{$cont}/S",

                // Illegal byte
                "/[\xc0\xc1\xf5-\xff]/S",

                // Invalid 2-byte sequence, or valid one then an extra continuation byte
                "/[\xc2-\xdf](?!{$cont}{$after})/S",

                // Invalid 3-byte sequence, or valid one then an extra continuation byte
                "/\xe0(?![\xa0-\xbf]{$cont}{$after})/",
                "/[\xe1-\xec\xee\xef](?!{$cont}{2}{$after})/S",
                "/\xed(?![\x80-\x9f]{$cont}{$after})/",

                // Invalid 4-byte sequence, or valid one then an extra continuation byte
                "/\xf0(?![\x90-\xbf]{$cont}{2}{$after})/",
                "/[\xf1-\xf3](?!{$cont}{3}{$after})/S",
                "/\xf4(?![\x80-\x8f]{$cont}{2}{$after})/",
            );
        }

        foreach ( $regexes as $regex ) {
            if ( preg_match( $regex, $value ) !== 0 ) {
                return false;
            }
        }

        return true;
    }

    /**
     * Replace delimited sections of a string with a fixed replacement.
     *
     * The subject is split on every start delimiter. For each resulting
     * segment, everything up to and including the first end delimiter is
     * replaced by $replace; a segment without an end delimiter is emitted
     * unchanged together with its start delimiter.
     *
     * @param string $startDelim Start delimiter
     * @param string $endDelim End delimiter
     * @param string $replace Replacement text (used literally)
     * @param string $subject String to search
     * @return string The processed string
     */
    static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
        $segments = explode( $startDelim, $subject );
        // Text before the first start delimiter is always kept verbatim
        $output = array_shift( $segments );
        foreach ( $segments as $s ) {
            $endDelimPos = strpos( $s, $endDelim );
            if ( $endDelimPos === false ) {
                $output .= $startDelim . $s;
            } else {
                $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
            }
        }

        return $output;
    }

    /**
     * Replace delimited sections of a string using a callback.
     *
     * For every START ... END pair found, $callback is invoked with an array
     * whose element 0 is the whole match including delimiters and whose
     * element 1 is the text between the delimiters; its return value is
     * inserted in place of the match. START START END matches the outer pair.
     * Unmatched delimiters are copied to the output unchanged.
     *
     * @param string $startDelim Start delimiter
     * @param string $endDelim End delimiter
     * @param callable $callback Function to call on each match
     * @param string $subject String to search
     * @param string $flags PCRE modifier characters appended to the internal
     *   regex; if it contains "i" the delimiters are compared case-insensitively
     * @throws MWException If a match is found where neither delimiter is non-empty
     * @return string The processed string
     */
    static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
        $subject, $flags = ''
    ) {
        $inputPos = 0;
        $outputPos = 0;
        // Only meaningful while $foundStart is true; initialised for clarity
        $contentPos = 0;
        $output = '';
        $foundStart = false;
        $encStart = preg_quote( $startDelim, '!' );
        $encEnd = preg_quote( $endDelim, '!' );
        $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
        $endLength = strlen( $endDelim );
        // Loop invariants: the subject and the regex never change
        $subjectLength = strlen( $subject );
        $regex = "!($encStart)|($encEnd)!S$flags";
        $m = array();

        while ( $inputPos < $subjectLength &&
            preg_match( $regex, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
        ) {
            $tokenOffset = $m[0][1];
            if ( $m[1][0] !== '' ) {
                if ( $foundStart &&
                    $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) === 0
                ) {
                    # An end match is present at the same location
                    $tokenType = 'end';
                    $tokenLength = $endLength;
                } else {
                    $tokenType = 'start';
                    $tokenLength = strlen( $m[0][0] );
                }
            } elseif ( isset( $m[2] ) && $m[2][0] !== '' ) {
                # Group 2 is absent from $m when only an (empty) group 1 matched,
                # hence the isset() guard
                $tokenType = 'end';
                $tokenLength = strlen( $m[0][0] );
            } else {
                throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
            }

            if ( $tokenType === 'start' ) {
                # Only move the start position if we haven't already found a start
                # This means that START START END matches outer pair
                if ( !$foundStart ) {
                    # Found start
                    $inputPos = $tokenOffset + $tokenLength;
                    # Write out the non-matching section
                    $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
                    $outputPos = $tokenOffset;
                    $contentPos = $inputPos;
                    $foundStart = true;
                } else {
                    # Move the input position past the *first character* of START,
                    # to protect against missing END when it overlaps with START
                    $inputPos = $tokenOffset + 1;
                }
            } else {
                # $tokenType is 'end'
                if ( $foundStart ) {
                    # Found match
                    $output .= call_user_func( $callback, array(
                        substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
                        substr( $subject, $contentPos, $tokenOffset - $contentPos )
                    ) );
                    $foundStart = false;
                } else {
                    # Non-matching end, write it out together with the pending text
                    $output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
                }
                $inputPos = $outputPos = $tokenOffset + $tokenLength;
            }
        }
        # Copy whatever is left, including an unterminated START section
        if ( $outputPos < $subjectLength ) {
            $output .= substr( $subject, $outputPos );
        }

        return $output;
    }

    /**
     * Replace delimited sections of a string using a replacement template.
     *
     * Thin wrapper around delimiterReplaceCallback() in which "$0" in $replace
     * stands for the whole match and "$1" for the text between the delimiters.
     *
     * @param string $startDelim Start delimiter
     * @param string $endDelim End delimiter
     * @param string $replace Replacement template
     * @param string $subject String to search
     * @param string $flags Flags passed through to delimiterReplaceCallback()
     * @return string The processed string
     */
    static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
        $replacer = new RegexlikeReplacer( $replace );

        return self::delimiterReplaceCallback( $startDelim, $endDelim,
            $replacer->cb(), $subject, $flags );
    }

    /**
     * Explode a string, ignoring separators that appear inside "<...>" sections.
     *
     * NUL bytes are removed from $text because NUL is used internally as a
     * placeholder for the protected separators.
     *
     * @param string $separator Separator to split on
     * @param string $text Text to split
     * @return array The pieces, with protected separators restored
     */
    static function explodeMarkup( $separator, $text ) {
        $placeholder = "\x00";

        // Remove placeholder instances
        $text = str_replace( $placeholder, '', $text );

        // Replace instances of the separator inside HTML-like tags with the placeholder
        $replacer = new DoubleReplacer( $separator, $placeholder );
        $cleaned = self::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );

        // Explode, then put the replaced separators back in
        $items = explode( $separator, $cleaned );
        foreach ( $items as $i => $str ) {
            $items[$i] = str_replace( $placeholder, $separator, $str );
        }

        return $items;
    }

    /**
     * Escape a string so it is taken literally as a preg_replace() replacement.
     *
     * Backslashes are doubled and dollar signs are prefixed with a backslash.
     *
     * @param string $string String to escape
     * @return string Escaped string
     */
    static function escapeRegexReplacement( $string ) {
        // Backslashes first, so the ones added for "$" are not doubled again
        $string = str_replace( '\\', '\\\\', $string );
        $string = str_replace( '$', '\\$', $string );

        return $string;
    }

    /**
     * Split a string on a separator and return the pieces as an iterator.
     *
     * Subjects containing more than 1000 separators are wrapped in an
     * ExplodeIterator; smaller ones are exploded eagerly into an ArrayIterator.
     *
     * @param string $separator Separator to split on
     * @param string $subject String to split
     * @return ArrayIterator|ExplodeIterator
     */
    static function explode( $separator, $subject ) {
        if ( substr_count( $subject, $separator ) > 1000 ) {
            return new ExplodeIterator( $separator, $subject );
        } else {
            return new ArrayIterator( explode( $separator, $subject ) );
        }
    }
}
00318
/**
 * Base class for "replacers": objects that expose a replace() method usable
 * as a callback. Subclasses are expected to provide replace( $matches ).
 */
class Replacer {
    /**
     * Get a callable that invokes this object's replace() method.
     *
     * @return array Callable in array( object, methodName ) form
     */
    function cb() {
        // Objects are passed by handle, so no reference to $this is needed
        return array( $this, 'replace' );
    }
}
00331
/**
 * Replacer that fills a template containing "$0", "$1", ... placeholders
 * with the corresponding elements of the match array.
 */
class RegexlikeReplacer extends Replacer {
    // The replacement template
    private $r;

    /**
     * @param string $r Replacement template
     */
    function __construct( $r ) {
        $this->r = $r;
    }

    /**
     * Substitute every "$<key>" placeholder in the template with the match
     * stored under that key.
     *
     * @param array $matches Match array, as passed to a replace callback
     * @return string The filled-in template
     */
    function replace( $matches ) {
        $substitutions = array();
        foreach ( $matches as $key => $text ) {
            $substitutions['$' . $key] = $text;
        }

        return strtr( $this->r, $substitutions );
    }
}
00358
/**
 * Replacer that performs a plain str_replace() on one element of the match
 * array.
 */
class DoubleReplacer extends Replacer {
    // Declared explicitly so no dynamic properties are created in the
    // constructor; kept public because the previously undeclared properties
    // were implicitly public.
    public $from, $to, $index;

    /**
     * @param mixed $from Search value(s), as accepted by str_replace()
     * @param mixed $to Replacement value(s), as accepted by str_replace()
     * @param int $index Key of the match element to operate on
     */
    function __construct( $from, $to, $index = 0 ) {
        $this->from = $from;
        $this->to = $to;
        $this->index = $index;
    }

    /**
     * @param array $matches Match array, as passed to a replace callback
     * @return mixed Result of str_replace() on the selected match element
     */
    function replace( $matches ) {
        return str_replace( $this->from, $this->to, $matches[$this->index] );
    }
}
00382
/**
 * Replacer that looks up one element of the match array in a table and
 * returns the value stored there.
 */
class HashtableReplacer extends Replacer {
    private $table, $index;

    /**
     * @param array $table Lookup table keyed by matched text
     * @param int $index Key of the match element used for the lookup
     */
    function __construct( $table, $index = 0 ) {
        $this->index = $index;
        $this->table = $table;
    }

    /**
     * @param array $matches Match array, as passed to a replace callback
     * @return mixed The table entry for the selected match element
     */
    function replace( $matches ) {
        $lookupKey = $matches[$this->index];

        return $this->table[$lookupKey];
    }
}
00406
/**
 * A set of from => to replacement pairs that can be applied to a string.
 *
 * When the fss_prep_replace() function exists, a prepared handle is cached
 * in $fss and reused until the pairs change; otherwise strtr() is used.
 */
class ReplacementArray {
    // The from => to replacement pairs
    private $data = false;
    // Cached result of fss_prep_replace(), or false when it must be rebuilt
    private $fss = false;

    /**
     * @param array $data Initial from => to replacement pairs
     */
    function __construct( $data = array() ) {
        $this->data = $data;
    }

    /**
     * Serialize only the replacement pairs, not the cached handle.
     *
     * @return array Names of the properties to serialize
     */
    function __sleep() {
        return array( 'data' );
    }

    /**
     * Mark the cached handle as missing after unserialization.
     */
    function __wakeup() {
        $this->fss = false;
    }

    /**
     * Replace the whole set of pairs.
     *
     * @param array $data New from => to replacement pairs
     */
    function setArray( $data ) {
        $this->fss = false;
        $this->data = $data;
    }

    /**
     * @return array The current from => to replacement pairs
     */
    function getArray() {
        return $this->data;
    }

    /**
     * Add or overwrite a single pair.
     *
     * @param string $from Text to search for
     * @param string $to Replacement text
     */
    function setPair( $from, $to ) {
        $this->fss = false;
        $this->data[$from] = $to;
    }

    /**
     * Merge additional pairs into the set using array_merge().
     *
     * @param array $data from => to replacement pairs to merge in
     */
    function mergeArray( $data ) {
        $combined = array_merge( $this->data, $data );
        $this->data = $combined;
        $this->fss = false;
    }

    /**
     * Merge the pairs of another ReplacementArray into this one.
     *
     * @param ReplacementArray $other Object whose pairs are merged in
     */
    function merge( $other ) {
        $combined = array_merge( $this->data, $other->data );
        $this->data = $combined;
        $this->fss = false;
    }

    /**
     * Remove a single pair.
     *
     * @param string $from Search text of the pair to remove
     */
    function removePair( $from ) {
        $this->fss = false;
        unset( $this->data[$from] );
    }

    /**
     * Remove every pair whose search text is a key of $data.
     *
     * @param array $data Array keyed by the search texts to remove; the
     *   values are ignored
     */
    function removeArray( $data ) {
        foreach ( $data as $key => $ignored ) {
            $this->removePair( $key );
        }
        $this->fss = false;
    }

    /**
     * Apply all replacement pairs to a string.
     *
     * @param string $subject String to process
     * @return string The string with replacements applied
     */
    function replace( $subject ) {
        if ( !function_exists( 'fss_prep_replace' ) ) {
            wfProfileIn( __METHOD__ . '-strtr' );
            $result = strtr( $subject, $this->data );
            wfProfileOut( __METHOD__ . '-strtr' );

            return $result;
        }

        wfProfileIn( __METHOD__ . '-fss' );
        // Build the prepared handle lazily and reuse it until the pairs change
        if ( $this->fss === false ) {
            $this->fss = fss_prep_replace( $this->data );
        }
        $result = fss_exec_replace( $this->fss, $subject );
        wfProfileOut( __METHOD__ . '-fss' );

        return $result;
    }
}
00516
/**
 * Iterator yielding the pieces of a string split on a delimiter, locating
 * one delimiter per step instead of splitting the whole string up front.
 *
 * The key of each element is the byte offset at which the piece starts.
 */
class ExplodeIterator implements Iterator {
    // The string being split, and its length in bytes
    private $subject, $subjectLength;

    // The delimiter, and its length in bytes
    private $delim, $delimLength;

    // Offset where the current piece starts, or false once iteration is over
    private $curPos;

    // Offset of the delimiter ending the current piece, or false if none is left
    private $endPos;

    // The current piece, or false once iteration is over
    private $current;

    /**
     * Construct the iterator and position it on the first piece.
     *
     * @param string $delim Delimiter to split on
     * @param string $subject String to split
     */
    function __construct( $delim, $subject ) {
        $this->delim = $delim;
        $this->subject = $subject;

        // Lengths are cached so they are not recomputed on every step
        $this->delimLength = strlen( $delim );
        $this->subjectLength = strlen( $subject );

        $this->rewind();
    }

    /**
     * Reposition the iterator on the first piece.
     */
    function rewind() {
        $this->curPos = 0;
        $this->endPos = strpos( $this->subject, $this->delim );
        $this->refreshCurrent();
    }

    /**
     * Recompute the current piece from the stored start and end offsets.
     */
    function refreshCurrent() {
        $start = $this->curPos;

        if ( $start === false ) {
            // Iteration has finished
            $this->current = false;
            return;
        }

        if ( $start >= $this->subjectLength ) {
            // Start offset is at or past the end of the subject: empty piece
            $this->current = '';
            return;
        }

        $this->current = $this->endPos === false
            ? substr( $this->subject, $start )
            : substr( $this->subject, $start, $this->endPos - $start );
    }

    /**
     * @return string|bool The current piece, or false once iteration is over
     */
    function current() {
        return $this->current;
    }

    /**
     * @return int|bool Start offset of the current piece, or false once
     *   iteration is over
     */
    function key() {
        return $this->curPos;
    }

    /**
     * Advance to the next piece.
     *
     * @return string|bool The new current piece, or false once iteration is over
     */
    function next() {
        $delimAt = $this->endPos;

        if ( $delimAt === false ) {
            // No delimiter followed the previous piece, so it was the last one
            $this->curPos = false;
        } else {
            $nextStart = $delimAt + $this->delimLength;
            $this->curPos = $nextStart;
            $this->endPos = $nextStart >= $this->subjectLength
                ? false
                : strpos( $this->subject, $this->delim, $nextStart );
        }
        $this->refreshCurrent();

        return $this->current;
    }

    /**
     * @return bool Whether the iterator currently points at a piece
     */
    function valid() {
        return $this->curPos !== false;
    }
}