PHPXRef 0.7.1 : MediaWiki-1.24.0 : /includes/utils/StringUtils.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  /**
   3   * Methods to play with strings.
   4   *
   5   * This program is free software; you can redistribute it and/or modify
   6   * it under the terms of the GNU General Public License as published by
   7   * the Free Software Foundation; either version 2 of the License, or
   8   * (at your option) any later version.
   9   *
  10   * This program is distributed in the hope that it will be useful,
  11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13   * GNU General Public License for more details.
  14   *
  15   * You should have received a copy of the GNU General Public License along
  16   * with this program; if not, write to the Free Software Foundation, Inc.,
  17   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18   * http://www.gnu.org/copyleft/gpl.html
  19   *
  20   * @file
  21   */
  22  
  23  /**
  24   * A collection of static methods to play with strings.
  25   */
  26  class StringUtils {
  27      /**
  28       * Test whether a string is valid UTF-8.
  29       *
  30       * The function check for invalid byte sequences, overlong encoding but
  31       * not for different normalisations.
  32       *
  33       * This relies internally on the mbstring function mb_check_encoding()
  34       * hardcoded to check against UTF-8. Whenever the function is not available
  35       * we fallback to a pure PHP implementation. Setting $disableMbstring to
  36       * true will skip the use of mb_check_encoding, this is mostly intended for
  37       * unit testing our internal implementation.
  38       *
  39       * @since 1.21
  40       * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
  41       * In particular, the pure PHP code path did not in fact check for overlong forms.
  42       * Beware of this when backporting code to that version of MediaWiki.
  43       *
  44       * @param string $value String to check
  45       * @param bool $disableMbstring Whether to use the pure PHP
  46       * implementation instead of trying mb_check_encoding. Intended for unit
  47       * testing. Default: false
  48       *
  49       * @return bool Whether the given $value is a valid UTF-8 encoded string
  50       */
  51  	static function isUtf8( $value, $disableMbstring = false ) {
  52          $value = (string)$value;
  53  
  54          // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
  55          // U+10FFFF are incorrectly allowed, so we have to check for them separately.
  56          if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
  57              static $newPHP;
  58              if ( $newPHP === null ) {
  59                  $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
  60              }
  61  
  62              return mb_check_encoding( $value, 'UTF-8' ) &&
  63                  ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
  64          }
  65  
  66          if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
  67              // String contains only ASCII characters, has to be valid
  68              return true;
  69          }
  70  
  71          // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
  72          // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
  73          // sequences, which can be as long as the input string is. Multiple short regexes are
  74          // used rather than a single long regex for performance.
  75          static $regexes;
  76          if ( $regexes === null ) {
  77              $cont = "[\x80-\xbf]";
  78              $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
  79              $regexes = array(
  80                  // Continuation byte at the start
  81                  "/^$cont/",
  82  
  83                  // ASCII byte followed by a continuation byte
  84                  "/[\\x00-\x7f]$cont/S",
  85  
  86                  // Illegal byte
  87                  "/[\xc0\xc1\xf5-\xff]/S",
  88  
  89                  // Invalid 2-byte sequence, or valid one then an extra continuation byte
  90                  "/[\xc2-\xdf](?!$cont$after)/S",
  91  
  92                  // Invalid 3-byte sequence, or valid one then an extra continuation byte
  93                  "/\xe0(?![\xa0-\xbf]$cont$after)/",
  94                  "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
  95                  "/\xed(?![\x80-\x9f]$cont$after)/",
  96  
  97                  // Invalid 4-byte sequence, or valid one then an extra continuation byte
  98                  "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
  99                  "/[\xf1-\xf3](?!$cont{3}$after)/S",
 100                  "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
 101              );
 102          }
 103  
 104          foreach ( $regexes as $regex ) {
 105              if ( preg_match( $regex, $value ) !== 0 ) {
 106                  return false;
 107              }
 108          }
 109  
 110          return true;
 111      }
 112  
 113      /**
 114       * Perform an operation equivalent to
 115       *
 116       *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
 117       *
 118       * except that it's worst-case O(N) instead of O(N^2)
 119       *
 120       * Compared to delimiterReplace(), this implementation is fast but memory-
 121       * hungry and inflexible. The memory requirements are such that I don't
 122       * recommend using it on anything but guaranteed small chunks of text.
 123       *
 124       * @param string $startDelim
 125       * @param string $endDelim
 126       * @param string $replace
 127       * @param string $subject
 128       *
 129       * @return string
 130       */
 131  	static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
 132          $segments = explode( $startDelim, $subject );
 133          $output = array_shift( $segments );
 134          foreach ( $segments as $s ) {
 135              $endDelimPos = strpos( $s, $endDelim );
 136              if ( $endDelimPos === false ) {
 137                  $output .= $startDelim . $s;
 138              } else {
 139                  $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
 140              }
 141          }
 142  
 143          return $output;
 144      }
 145  
 146      /**
 147       * Perform an operation equivalent to
 148       *
 149       *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
 150       *
 151       * This implementation is slower than hungryDelimiterReplace but uses far less
 152       * memory. The delimiters are literal strings, not regular expressions.
 153       *
 154       * If the start delimiter ends with an initial substring of the end delimiter,
 155       * e.g. in the case of C-style comments, the behavior differs from the model
 156       * regex. In this implementation, the end must share no characters with the
 157       * start, so e.g. /*\/ is not considered to be both the start and end of a
 158       * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
 159       *
 160       * @param string $startDelim Start delimiter
 161       * @param string $endDelim End delimiter
 162       * @param callable $callback Function to call on each match
 163       * @param string $subject
 164       * @param string $flags Regular expression flags
 165       * @throws MWException
 166       * @return string
 167       */
 168  	static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
 169          $subject, $flags = ''
 170      ) {
 171          $inputPos = 0;
 172          $outputPos = 0;
 173          $output = '';
 174          $foundStart = false;
 175          $encStart = preg_quote( $startDelim, '!' );
 176          $encEnd = preg_quote( $endDelim, '!' );
 177          $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
 178          $endLength = strlen( $endDelim );
 179          $m = array();
 180  
 181          while ( $inputPos < strlen( $subject ) &&
 182              preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
 183          ) {
 184              $tokenOffset = $m[0][1];
 185              if ( $m[1][0] != '' ) {
 186                  if ( $foundStart &&
 187                      $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
 188                  ) {
 189                      # An end match is present at the same location
 190                      $tokenType = 'end';
 191                      $tokenLength = $endLength;
 192                  } else {
 193                      $tokenType = 'start';
 194                      $tokenLength = strlen( $m[0][0] );
 195                  }
 196              } elseif ( $m[2][0] != '' ) {
 197                  $tokenType = 'end';
 198                  $tokenLength = strlen( $m[0][0] );
 199              } else {
 200                  throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
 201              }
 202  
 203              if ( $tokenType == 'start' ) {
 204                  # Only move the start position if we haven't already found a start
 205                  # This means that START START END matches outer pair
 206                  if ( !$foundStart ) {
 207                      # Found start
 208                      $inputPos = $tokenOffset + $tokenLength;
 209                      # Write out the non-matching section
 210                      $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
 211                      $outputPos = $tokenOffset;
 212                      $contentPos = $inputPos;
 213                      $foundStart = true;
 214                  } else {
 215                      # Move the input position past the *first character* of START,
 216                      # to protect against missing END when it overlaps with START
 217                      $inputPos = $tokenOffset + 1;
 218                  }
 219              } elseif ( $tokenType == 'end' ) {
 220                  if ( $foundStart ) {
 221                      # Found match
 222                      $output .= call_user_func( $callback, array(
 223                          substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
 224                          substr( $subject, $contentPos, $tokenOffset - $contentPos )
 225                      ) );
 226                      $foundStart = false;
 227                  } else {
 228                      # Non-matching end, write it out
 229                      $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
 230                  }
 231                  $inputPos = $outputPos = $tokenOffset + $tokenLength;
 232              } else {
 233                  throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
 234              }
 235          }
 236          if ( $outputPos < strlen( $subject ) ) {
 237              $output .= substr( $subject, $outputPos );
 238          }
 239  
 240          return $output;
 241      }
 242  
 243      /**
 244       * Perform an operation equivalent to
 245       *
 246       *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
 247       *
 248       * @param string $startDelim Start delimiter regular expression
 249       * @param string $endDelim End delimiter regular expression
 250       * @param string $replace Replacement string. May contain $1, which will be
 251       *                 replaced by the text between the delimiters
 252       * @param string $subject String to search
 253       * @param string $flags Regular expression flags
 254       * @return string The string with the matches replaced
 255       */
 256  	static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
 257          $replacer = new RegexlikeReplacer( $replace );
 258  
 259          return self::delimiterReplaceCallback( $startDelim, $endDelim,
 260              $replacer->cb(), $subject, $flags );
 261      }
 262  
 263      /**
 264       * More or less "markup-safe" explode()
 265       * Ignores any instances of the separator inside <...>
 266       * @param string $separator
 267       * @param string $text
 268       * @return array
 269       */
 270  	static function explodeMarkup( $separator, $text ) {
 271          $placeholder = "\x00";
 272  
 273          // Remove placeholder instances
 274          $text = str_replace( $placeholder, '', $text );
 275  
 276          // Replace instances of the separator inside HTML-like tags with the placeholder
 277          $replacer = new DoubleReplacer( $separator, $placeholder );
 278          $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
 279  
 280          // Explode, then put the replaced separators back in
 281          $items = explode( $separator, $cleaned );
 282          foreach ( $items as $i => $str ) {
 283              $items[$i] = str_replace( $placeholder, $separator, $str );
 284          }
 285  
 286          return $items;
 287      }
 288  
 289      /**
 290       * Escape a string to make it suitable for inclusion in a preg_replace()
 291       * replacement parameter.
 292       *
 293       * @param string $string
 294       * @return string
 295       */
 296  	static function escapeRegexReplacement( $string ) {
 297          $string = str_replace( '\\', '\\\\', $string );
 298          $string = str_replace( '$', '\\$', $string );
 299  
 300          return $string;
 301      }
 302  
 303      /**
 304       * Workalike for explode() with limited memory usage.
 305       * Returns an Iterator
 306       * @param string $separator
 307       * @param string $subject
 308       * @return ArrayIterator|ExplodeIterator
 309       */
 310  	static function explode( $separator, $subject ) {
 311          if ( substr_count( $subject, $separator ) > 1000 ) {
 312              return new ExplodeIterator( $separator, $subject );
 313          } else {
 314              return new ArrayIterator( explode( $separator, $subject ) );
 315          }
 316      }
 317  }
 318  
 319  /**
 320   * Base class for "replacers", objects used in preg_replace_callback() and
 321   * StringUtils::delimiterReplaceCallback()
 322   */
 323  class Replacer {
 324      /**
 325       * @return array
 326       */
 327      function cb() {
 328          return array( &$this, 'replace' );
 329      }
 330  }
 331  
 332  /**
 333   * Class to replace regex matches with a string similar to that used in preg_replace()
 334   */
 335  class RegexlikeReplacer extends Replacer {
 336      private $r;
 337  
 338      /**
 339       * @param string $r
 340       */
 341  	function __construct( $r ) {
 342          $this->r = $r;
 343      }
 344  
 345      /**
 346       * @param array $matches
 347       * @return string
 348       */
 349  	function replace( $matches ) {
 350          $pairs = array();
 351          foreach ( $matches as $i => $match ) {
 352              $pairs["\$$i"] = $match;
 353          }
 354  
 355          return strtr( $this->r, $pairs );
 356      }
 357  }
 358  
 359  /**
 360   * Class to perform secondary replacement within each replacement string
 361   */
 362  class DoubleReplacer extends Replacer {
 363      /**
 364       * @param mixed $from
 365       * @param mixed $to
 366       * @param int $index
 367       */
 368  	function __construct( $from, $to, $index = 0 ) {
 369          $this->from = $from;
 370          $this->to = $to;
 371          $this->index = $index;
 372      }
 373  
 374      /**
 375       * @param array $matches
 376       * @return mixed
 377       */
 378  	function replace( $matches ) {
 379          return str_replace( $this->from, $this->to, $matches[$this->index] );
 380      }
 381  }
 382  
 383  /**
 384   * Class to perform replacement based on a simple hashtable lookup
 385   */
 386  class HashtableReplacer extends Replacer {
 387      private $table, $index;
 388  
 389      /**
 390       * @param array $table
 391       * @param int $index
 392       */
 393  	function __construct( $table, $index = 0 ) {
 394          $this->table = $table;
 395          $this->index = $index;
 396      }
 397  
 398      /**
 399       * @param array $matches
 400       * @return mixed
 401       */
 402  	function replace( $matches ) {
 403          return $this->table[$matches[$this->index]];
 404      }
 405  }
 406  
 407  /**
 408   * Replacement array for FSS with fallback to strtr()
 409   * Supports lazy initialisation of FSS resource
 410   */
 411  class ReplacementArray {
 412      private $data = false;
 413      private $fss = false;
 414  
 415      /**
 416       * Create an object with the specified replacement array
 417       * The array should have the same form as the replacement array for strtr()
 418       * @param array $data
 419       */
 420  	function __construct( $data = array() ) {
 421          $this->data = $data;
 422      }
 423  
 424      /**
 425       * @return array
 426       */
 427  	function __sleep() {
 428          return array( 'data' );
 429      }
 430  
 431  	function __wakeup() {
 432          $this->fss = false;
 433      }
 434  
 435      /**
 436       * Set the whole replacement array at once
 437       * @param array $data
 438       */
 439  	function setArray( $data ) {
 440          $this->data = $data;
 441          $this->fss = false;
 442      }
 443  
 444      /**
 445       * @return array|bool
 446       */
 447  	function getArray() {
 448          return $this->data;
 449      }
 450  
 451      /**
 452       * Set an element of the replacement array
 453       * @param string $from
 454       * @param string $to
 455       */
 456  	function setPair( $from, $to ) {
 457          $this->data[$from] = $to;
 458          $this->fss = false;
 459      }
 460  
 461      /**
 462       * @param array $data
 463       */
 464  	function mergeArray( $data ) {
 465          $this->data = array_merge( $this->data, $data );
 466          $this->fss = false;
 467      }
 468  
 469      /**
 470       * @param ReplacementArray $other
 471       */
 472  	function merge( $other ) {
 473          $this->data = array_merge( $this->data, $other->data );
 474          $this->fss = false;
 475      }
 476  
 477      /**
 478       * @param string $from
 479       */
 480  	function removePair( $from ) {
 481          unset( $this->data[$from] );
 482          $this->fss = false;
 483      }
 484  
 485      /**
 486       * @param array $data
 487       */
 488  	function removeArray( $data ) {
 489          foreach ( $data as $from => $to ) {
 490              $this->removePair( $from );
 491          }
 492          $this->fss = false;
 493      }
 494  
 495      /**
 496       * @param string $subject
 497       * @return string
 498       */
 499  	function replace( $subject ) {
 500          if ( function_exists( 'fss_prep_replace' ) ) {
 501              wfProfileIn( __METHOD__ . '-fss' );
 502              if ( $this->fss === false ) {
 503                  $this->fss = fss_prep_replace( $this->data );
 504              }
 505              $result = fss_exec_replace( $this->fss, $subject );
 506              wfProfileOut( __METHOD__ . '-fss' );
 507          } else {
 508              wfProfileIn( __METHOD__ . '-strtr' );
 509              $result = strtr( $subject, $this->data );
 510              wfProfileOut( __METHOD__ . '-strtr' );
 511          }
 512  
 513          return $result;
 514      }
 515  }
 516  
 517  /**
 518   * An iterator which works exactly like:
 519   *
 520   * foreach ( explode( $delim, $s ) as $element ) {
 521   *    ...
 522   * }
 523   *
 524   * Except it doesn't use 193 byte per element
 525   */
 526  class ExplodeIterator implements Iterator {
 527      // The subject string
 528      private $subject, $subjectLength;
 529  
 530      // The delimiter
 531      private $delim, $delimLength;
 532  
 533      // The position of the start of the line
 534      private $curPos;
 535  
 536      // The position after the end of the next delimiter
 537      private $endPos;
 538  
 539      // The current token
 540      private $current;
 541  
 542      /**
 543       * Construct a DelimIterator
 544       * @param string $delim
 545       * @param string $subject
 546       */
 547  	function __construct( $delim, $subject ) {
 548          $this->subject = $subject;
 549          $this->delim = $delim;
 550  
 551          // Micro-optimisation (theoretical)
 552          $this->subjectLength = strlen( $subject );
 553          $this->delimLength = strlen( $delim );
 554  
 555          $this->rewind();
 556      }
 557  
 558  	function rewind() {
 559          $this->curPos = 0;
 560          $this->endPos = strpos( $this->subject, $this->delim );
 561          $this->refreshCurrent();
 562      }
 563  
 564  	function refreshCurrent() {
 565          if ( $this->curPos === false ) {
 566              $this->current = false;
 567          } elseif ( $this->curPos >= $this->subjectLength ) {
 568              $this->current = '';
 569          } elseif ( $this->endPos === false ) {
 570              $this->current = substr( $this->subject, $this->curPos );
 571          } else {
 572              $this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
 573          }
 574      }
 575  
 576  	function current() {
 577          return $this->current;
 578      }
 579  
 580      /**
 581       * @return int|bool Current position or boolean false if invalid
 582       */
 583  	function key() {
 584          return $this->curPos;
 585      }
 586  
 587      /**
 588       * @return string
 589       */
 590  	function next() {
 591          if ( $this->endPos === false ) {
 592              $this->curPos = false;
 593          } else {
 594              $this->curPos = $this->endPos + $this->delimLength;
 595              if ( $this->curPos >= $this->subjectLength ) {
 596                  $this->endPos = false;
 597              } else {
 598                  $this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
 599              }
 600          }
 601          $this->refreshCurrent();
 602  
 603          return $this->current;
 604      }
 605  
 606      /**
 607       * @return bool
 608       */
 609  	function valid() {
 610          return $this->curPos !== false;
 611      }
 612  }
PHP Cross Reference of MediaWiki-1.24.0

/includes/utils/ -> StringUtils.php (source)