PHPXRef 0.7.1 : MediaWiki-1.24.0 : /includes/normal/UtfNormal.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  /**
   3   * Unicode normalization routines
   4   *
   5   * Copyright © 2004 Brion Vibber <[email protected]>
   6   * https://www.mediawiki.org/
   7   *
   8   * This program is free software; you can redistribute it and/or modify
   9   * it under the terms of the GNU General Public License as published by
  10   * the Free Software Foundation; either version 2 of the License, or
  11   * (at your option) any later version.
  12   *
  13   * This program is distributed in the hope that it will be useful,
  14   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16   * GNU General Public License for more details.
  17   *
  18   * You should have received a copy of the GNU General Public License along
  19   * with this program; if not, write to the Free Software Foundation, Inc.,
  20   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21   * http://www.gnu.org/copyleft/gpl.html
  22   *
  23   * @file
  24   * @ingroup UtfNormal
  25   */
  26  
  27  /**
  28   * @defgroup UtfNormal UtfNormal
  29   */
  30  
# Capability flags, evaluated once when this file is loaded.
# NORMALIZE_ICU is true when a utf8_normalize() function exists and
# NORMALIZE_INTL is true when normalizer_normalize() exists; UtfNormal
# checks them to prefer a native normalizer over its pure-PHP fallback.
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
  33  
  34  /**
  35   * Unicode normalization routines for working with UTF-8 strings.
  36   * Currently assumes that input strings are valid UTF-8!
  37   *
  38   * Not as fast as I'd like, but should be usable for most purposes.
  39   * UtfNormal::toNFC() will bail early if given ASCII text or text
  40   * it can quickly determine is already normalized.
  41   *
  42   * All functions can be called static.
  43   *
  44   * See description of forms at http://www.unicode.org/reports/tr15/
  45   *
  46   * @ingroup UtfNormal
  47   */
  48  class UtfNormal {
  49      /**
  50       * For using the ICU wrapper
  51       */
  52      const UNORM_NONE = 1;
  53      const UNORM_NFD = 2;
  54      const UNORM_NFKD = 3;
  55      const UNORM_NFC = 4;
  56      const UNORM_NFKC = 5;
  57      const UNORM_FCD = 6;
  58      const UNORM_DEFAULT = self::UNORM_NFC;
  59  
  60      public static $utfCombiningClass = null;
  61      public static $utfCanonicalComp = null;
  62      public static $utfCanonicalDecomp = null;
  63  
  64      # Load compatibility decompositions on demand if they are needed.
  65      public static $utfCompatibilityDecomp = null;
  66      public static $utfCheckNFC;
  67  
  68      /**
  69       * The ultimate convenience function! Clean up invalid UTF-8 sequences,
  70       * and convert to normal form C, canonical composition.
  71       *
  72       * Fast return for pure ASCII strings; some lesser optimizations for
  73       * strings containing only known-good characters. Not as fast as toNFC().
  74       *
  75       * @param string $string a UTF-8 string
  76       * @return string a clean, shiny, normalized UTF-8 string
  77       */
  78  	static function cleanUp( $string ) {
  79          if ( NORMALIZE_ICU ) {
  80              $string = self::replaceForNativeNormalize( $string );
  81  
  82              # UnicodeString constructor fails if the string ends with a
  83              # head byte. Add a junk char at the end, we'll strip it off.
  84              return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
  85          } elseif ( NORMALIZE_INTL ) {
  86              $string = self::replaceForNativeNormalize( $string );
  87              $norm = normalizer_normalize( $string, Normalizer::FORM_C );
  88              if ( $norm === null || $norm === false ) {
  89                  # normalizer_normalize will either return false or null
  90                  # (depending on which doc you read) if invalid utf8 string.
  91                  # quickIsNFCVerify cleans up invalid sequences.
  92  
  93                  if ( UtfNormal::quickIsNFCVerify( $string ) ) {
  94                      # if that's true, the string is actually already normal.
  95                      return $string;
  96                  } else {
  97                      # Now we are valid but non-normal
  98                      return normalizer_normalize( $string, Normalizer::FORM_C );
  99                  }
 100              } else {
 101                  return $norm;
 102              }
 103          } elseif ( UtfNormal::quickIsNFCVerify( $string ) ) {
 104              # Side effect -- $string has had UTF-8 errors cleaned up.
 105              return $string;
 106          } else {
 107              return UtfNormal::NFC( $string );
 108          }
 109      }
 110  
 111      /**
 112       * Convert a UTF-8 string to normal form C, canonical composition.
 113       * Fast return for pure ASCII strings; some lesser optimizations for
 114       * strings containing only known-good characters.
 115       *
 116       * @param string $string a valid UTF-8 string. Input is not validated.
 117       * @return string a UTF-8 string in normal form C
 118       */
 119  	static function toNFC( $string ) {
 120          if ( NORMALIZE_INTL )
 121              return normalizer_normalize( $string, Normalizer::FORM_C );
 122          elseif ( NORMALIZE_ICU )
 123              return utf8_normalize( $string, self::UNORM_NFC );
 124          elseif ( UtfNormal::quickIsNFC( $string ) )
 125              return $string;
 126          else
 127              return UtfNormal::NFC( $string );
 128      }
 129  
 130      /**
 131       * Convert a UTF-8 string to normal form D, canonical decomposition.
 132       * Fast return for pure ASCII strings.
 133       *
 134       * @param string $string a valid UTF-8 string. Input is not validated.
 135       * @return string a UTF-8 string in normal form D
 136       */
 137  	static function toNFD( $string ) {
 138          if ( NORMALIZE_INTL )
 139              return normalizer_normalize( $string, Normalizer::FORM_D );
 140          elseif ( NORMALIZE_ICU )
 141              return utf8_normalize( $string, self::UNORM_NFD );
 142          elseif ( preg_match( '/[\x80-\xff]/', $string ) )
 143              return UtfNormal::NFD( $string );
 144          else
 145              return $string;
 146      }
 147  
 148      /**
 149       * Convert a UTF-8 string to normal form KC, compatibility composition.
 150       * This may cause irreversible information loss, use judiciously.
 151       * Fast return for pure ASCII strings.
 152       *
 153       * @param string $string a valid UTF-8 string. Input is not validated.
 154       * @return string a UTF-8 string in normal form KC
 155       */
 156  	static function toNFKC( $string ) {
 157          if ( NORMALIZE_INTL )
 158              return normalizer_normalize( $string, Normalizer::FORM_KC );
 159          elseif ( NORMALIZE_ICU )
 160              return utf8_normalize( $string, self::UNORM_NFKC );
 161          elseif ( preg_match( '/[\x80-\xff]/', $string ) )
 162              return UtfNormal::NFKC( $string );
 163          else
 164              return $string;
 165      }
 166  
 167      /**
 168       * Convert a UTF-8 string to normal form KD, compatibility decomposition.
 169       * This may cause irreversible information loss, use judiciously.
 170       * Fast return for pure ASCII strings.
 171       *
 172       * @param string $string a valid UTF-8 string. Input is not validated.
 173       * @return string a UTF-8 string in normal form KD
 174       */
 175  	static function toNFKD( $string ) {
 176          if ( NORMALIZE_INTL )
 177              return normalizer_normalize( $string, Normalizer::FORM_KD );
 178          elseif ( NORMALIZE_ICU )
 179              return utf8_normalize( $string, self::UNORM_NFKD );
 180          elseif ( preg_match( '/[\x80-\xff]/', $string ) )
 181              return UtfNormal::NFKD( $string );
 182          else
 183              return $string;
 184      }
 185  
 186      /**
 187       * Load the basic composition data if necessary
 188       * @private
 189       */
 190  	static function loadData() {
 191          if ( !isset( self::$utfCombiningClass ) ) {
 192              require_once  __DIR__ . '/UtfNormalData.inc';
 193          }
 194      }
 195  
 196      /**
 197       * Returns true if the string is _definitely_ in NFC.
 198       * Returns false if not or uncertain.
 199       * @param string $string a valid UTF-8 string. Input is not validated.
 200       * @return bool
 201       */
 202  	static function quickIsNFC( $string ) {
 203          # ASCII is always valid NFC!
 204          # If it's pure ASCII, let it through.
 205          if ( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 206  
 207          UtfNormal::loadData();
 208          $len = strlen( $string );
 209          for ( $i = 0; $i < $len; $i++ ) {
 210              $c = $string[$i];
 211              $n = ord( $c );
 212              if ( $n < 0x80 ) {
 213                  continue;
 214              } elseif ( $n >= 0xf0 ) {
 215                  $c = substr( $string, $i, 4 );
 216                  $i += 3;
 217              } elseif ( $n >= 0xe0 ) {
 218                  $c = substr( $string, $i, 3 );
 219                  $i += 2;
 220              } elseif ( $n >= 0xc0 ) {
 221                  $c = substr( $string, $i, 2 );
 222                  $i++;
 223              }
 224              if ( isset( self::$utfCheckNFC[$c] ) ) {
 225                  # If it's NO or MAYBE, bail and do the slow check.
 226                  return false;
 227              }
 228              if ( isset( self::$utfCombiningClass[$c] ) ) {
 229                  # Combining character? We might have to do sorting, at least.
 230                  return false;
 231              }
 232          }
 233  
 234          return true;
 235      }
 236  
 237      /**
 238       * Returns true if the string is _definitely_ in NFC.
 239       * Returns false if not or uncertain.
 240       * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 241       * @return bool
 242       */
 243  	static function quickIsNFCVerify( &$string ) {
 244          # Screen out some characters that eg won't be allowed in XML
 245          $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
 246  
 247          # ASCII is always valid NFC!
 248          # If we're only ever given plain ASCII, we can avoid the overhead
 249          # of initializing the decomposition tables by skipping out early.
 250          if ( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 251  
 252          static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
 253          if ( !isset( $checkit ) ) {
 254              # Load/build some scary lookup tables...
 255              UtfNormal::loadData();
 256  
 257              $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
 258  
 259              # Head bytes for sequences which we should do further validity checks
 260              $checkit = array_flip( array_map( 'chr',
 261                  array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
 262                      0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
 263                      0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
 264  
 265              # Each UTF-8 head byte is followed by a certain
 266              # number of tail bytes.
 267              $tailBytes = array();
 268              for ( $n = 0; $n < 256; $n++ ) {
 269                  if ( $n < 0xc0 ) {
 270                      $remaining = 0;
 271                  } elseif ( $n < 0xe0 ) {
 272                      $remaining = 1;
 273                  } elseif ( $n < 0xf0 ) {
 274                      $remaining = 2;
 275                  } elseif ( $n < 0xf8 ) {
 276                      $remaining = 3;
 277                  } elseif ( $n < 0xfc ) {
 278                      $remaining = 4;
 279                  } elseif ( $n < 0xfe ) {
 280                      $remaining = 5;
 281                  } else {
 282                      $remaining = 0;
 283                  }
 284                  $tailBytes[chr( $n )] = $remaining;
 285              }
 286          }
 287  
 288          # Chop the text into pure-ASCII and non-ASCII areas;
 289          # large ASCII parts can be handled much more quickly.
 290          # Don't chop up Unicode areas for punctuation, though,
 291          # that wastes energy.
 292          $matches = array();
 293          preg_match_all(
 294              '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
 295              $string, $matches );
 296  
 297          $looksNormal = true;
 298          $base = 0;
 299          $replace = array();
 300          foreach ( $matches[1] as $str ) {
 301              $chunk = strlen( $str );
 302  
 303              if ( $str[0] < "\x80" ) {
 304                  # ASCII chunk: guaranteed to be valid UTF-8
 305                  # and in normal form C, so skip over it.
 306                  $base += $chunk;
 307                  continue;
 308              }
 309  
 310              # We'll have to examine the chunk byte by byte to ensure
 311              # that it consists of valid UTF-8 sequences, and to see
 312              # if any of them might not be normalized.
 313              #
 314              # Since PHP is not the fastest language on earth, some of
 315              # this code is a little ugly with inner loop optimizations.
 316  
 317              $head = '';
 318              $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
 319  
 320              for ( $i = -1; --$len; ) {
 321                  $remaining = $tailBytes[$c = $str[++$i]];
 322                  if ( $remaining ) {
 323                      # UTF-8 head byte!
 324                      $sequence = $head = $c;
 325                      do {
 326                          # Look for the defined number of tail bytes...
 327                          if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
 328                              # Legal tail bytes are nice.
 329                              $sequence .= $c;
 330                          } else {
 331                              if ( 0 == $len ) {
 332                                  # Premature end of string!
 333                                  # Drop a replacement character into output to
 334                                  # represent the invalid UTF-8 sequence.
 335                                  $replace[] = array( UTF8_REPLACEMENT,
 336                                      $base + $i + 1 - strlen( $sequence ),
 337                                      strlen( $sequence ) );
 338                                  break 2;
 339                              } else {
 340                                  # Illegal tail byte; abandon the sequence.
 341                                  $replace[] = array( UTF8_REPLACEMENT,
 342                                      $base + $i - strlen( $sequence ),
 343                                      strlen( $sequence ) );
 344                                  # Back up and reprocess this byte; it may itself
 345                                  # be a legal ASCII or UTF-8 sequence head.
 346                                  --$i;
 347                                  ++$len;
 348                                  continue 2;
 349                              }
 350                          }
 351                      } while ( --$remaining );
 352  
 353                      if ( isset( $checkit[$head] ) ) {
 354                          # Do some more detailed validity checks, for
 355                          # invalid characters and illegal sequences.
 356                          if ( $head == "\xed" ) {
 357                              # 0xed is relatively frequent in Korean, which
 358                              # abuts the surrogate area, so we're doing
 359                              # this check separately to speed things up.
 360  
 361                              if ( $sequence >= UTF8_SURROGATE_FIRST ) {
 362                                  # Surrogates are legal only in UTF-16 code.
 363                                  # They are totally forbidden here in UTF-8
 364                                  # utopia.
 365                                  $replace[] = array( UTF8_REPLACEMENT,
 366                                      $base + $i + 1 - strlen( $sequence ),
 367                                      strlen( $sequence ) );
 368                                  $head = '';
 369                                  continue;
 370                              }
 371                          } else {
 372                              # Slower, but rarer checks...
 373                              $n = ord( $head );
 374                              if (
 375                                  # "Overlong sequences" are those that are syntactically
 376                                  # correct but use more UTF-8 bytes than are necessary to
 377                                  # encode a character. Naïve string comparisons can be
 378                                  # tricked into failing to see a match for an ASCII
 379                                  # character, for instance, which can be a security hole
 380                                  # if blacklist checks are being used.
 381                                  ( $n < 0xc2 && $sequence <= UTF8_OVERLONG_A )
 382                                  || ( $n == 0xe0 && $sequence <= UTF8_OVERLONG_B )
 383                                  || ( $n == 0xf0 && $sequence <= UTF8_OVERLONG_C )
 384  
 385                                  # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
 386                                  || ( $n == 0xef &&
 387                                      ( $sequence == UTF8_FFFE )
 388                                      || ( $sequence == UTF8_FFFF ) )
 389  
 390                                  # Unicode has been limited to 21 bits; longer
 391                                  # sequences are not allowed.
 392                                  || ( $n >= 0xf0 && $sequence > UTF8_MAX )
 393                              ) {
 394  
 395                                  $replace[] = array( UTF8_REPLACEMENT,
 396                                      $base + $i + 1 - strlen( $sequence ),
 397                                      strlen( $sequence ) );
 398                                  $head = '';
 399                                  continue;
 400                              }
 401                          }
 402                      }
 403  
 404                      if ( isset( $utfCheckOrCombining[$sequence] ) ) {
 405                          # If it's NO or MAYBE, we'll have to rip
 406                          # the string apart and put it back together.
 407                          # That's going to be mighty slow.
 408                          $looksNormal = false;
 409                      }
 410  
 411                      # The sequence is legal!
 412                      $head = '';
 413                  } elseif ( $c < "\x80" ) {
 414                      # ASCII byte.
 415                      $head = '';
 416                  } elseif ( $c < "\xc0" ) {
 417                      # Illegal tail bytes
 418                      if ( $head == '' ) {
 419                          # Out of the blue!
 420                          $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
 421                      } else {
 422                          # Don't add if we're continuing a broken sequence;
 423                          # we already put a replacement character when we looked
 424                          # at the broken sequence.
 425                          $replace[] = array( '', $base + $i, 1 );
 426                      }
 427                  } else {
 428                      # Miscellaneous freaks.
 429                      $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
 430                      $head = '';
 431                  }
 432              }
 433              $base += $chunk;
 434          }
 435          if ( count( $replace ) ) {
 436              # There were illegal UTF-8 sequences we need to fix up.
 437              $out = '';
 438              $last = 0;
 439              foreach ( $replace as $rep ) {
 440                  list( $replacement, $start, $length ) = $rep;
 441                  if ( $last < $start ) {
 442                      $out .= substr( $string, $last, $start - $last );
 443                  }
 444                  $out .= $replacement;
 445                  $last = $start + $length;
 446              }
 447              if ( $last < strlen( $string ) ) {
 448                  $out .= substr( $string, $last );
 449              }
 450              $string = $out;
 451          }
 452  
 453          return $looksNormal;
 454      }
 455  
 456      # These take a string and run the normalization on them, without
 457      # checking for validity or any optimization etc. Input must be
 458      # VALID UTF-8!
 459      /**
 460       * @param $string string
 461       * @return string
 462       * @private
 463       */
 464  	static function NFC( $string ) {
 465          return UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
 466      }
 467  
 468      /**
 469       * @param $string string
 470       * @return string
 471       * @private
 472       */
 473  	static function NFD( $string ) {
 474          UtfNormal::loadData();
 475  
 476          return UtfNormal::fastCombiningSort(
 477              UtfNormal::fastDecompose( $string, self::$utfCanonicalDecomp ) );
 478      }
 479  
 480      /**
 481       * @param $string string
 482       * @return string
 483       * @private
 484       */
 485  	static function NFKC( $string ) {
 486          return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
 487      }
 488  
 489      /**
 490       * @param $string string
 491       * @return string
 492       * @private
 493       */
 494  	static function NFKD( $string ) {
 495          if ( !isset( self::$utfCompatibilityDecomp ) ) {
 496              require_once  'UtfNormalDataK.inc';
 497          }
 498  
 499          return self::fastCombiningSort(
 500              self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
 501      }
 502  
 503      /**
 504       * Perform decomposition of a UTF-8 string into either D or KD form
 505       * (depending on which decomposition map is passed to us).
 506       * Input is assumed to be *valid* UTF-8. Invalid code will break.
 507       * @private
 508       * @param string $string valid UTF-8 string
 509       * @param array $map hash of expanded decomposition map
 510       * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 511       */
 512  	static function fastDecompose( $string, $map ) {
 513          UtfNormal::loadData();
 514          $len = strlen( $string );
 515          $out = '';
 516          for ( $i = 0; $i < $len; $i++ ) {
 517              $c = $string[$i];
 518              $n = ord( $c );
 519              if ( $n < 0x80 ) {
 520                  # ASCII chars never decompose
 521                  # THEY ARE IMMORTAL
 522                  $out .= $c;
 523                  continue;
 524              } elseif ( $n >= 0xf0 ) {
 525                  $c = substr( $string, $i, 4 );
 526                  $i += 3;
 527              } elseif ( $n >= 0xe0 ) {
 528                  $c = substr( $string, $i, 3 );
 529                  $i += 2;
 530              } elseif ( $n >= 0xc0 ) {
 531                  $c = substr( $string, $i, 2 );
 532                  $i++;
 533              }
 534              if ( isset( $map[$c] ) ) {
 535                  $out .= $map[$c];
 536                  continue;
 537              } else {
 538                  if ( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
 539                      # Decompose a hangul syllable into jamo;
 540                      # hardcoded for three-byte UTF-8 sequence.
 541                      # A lookup table would be slightly faster,
 542                      # but adds a lot of memory & disk needs.
 543                      #
 544                      $index = ( ( ord( $c[0] ) & 0x0f ) << 12
 545                              | ( ord( $c[1] ) & 0x3f ) << 6
 546                              | ( ord( $c[2] ) & 0x3f ) )
 547                          - UNICODE_HANGUL_FIRST;
 548                      $l = intval( $index / UNICODE_HANGUL_NCOUNT );
 549                      $v = intval( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
 550                      $t = $index % UNICODE_HANGUL_TCOUNT;
 551                      $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
 552                      if ( $t >= 25 ) {
 553                          $out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
 554                      } elseif ( $t ) {
 555                          $out .= "\xe1\x86" . chr( 0xa7 + $t );
 556                      }
 557                      continue;
 558                  }
 559              }
 560              $out .= $c;
 561          }
 562  
 563          return $out;
 564      }
 565  
 566      /**
 567       * Sorts combining characters into canonical order. This is the
 568       * final step in creating decomposed normal forms D and KD.
 569       * @private
 570       * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 571       * @return string a UTF-8 string with combining characters sorted in canonical order
 572       */
 573  	static function fastCombiningSort( $string ) {
 574          UtfNormal::loadData();
 575          $len = strlen( $string );
 576          $out = '';
 577          $combiners = array();
 578          $lastClass = -1;
 579          for ( $i = 0; $i < $len; $i++ ) {
 580              $c = $string[$i];
 581              $n = ord( $c );
 582              if ( $n >= 0x80 ) {
 583                  if ( $n >= 0xf0 ) {
 584                      $c = substr( $string, $i, 4 );
 585                      $i += 3;
 586                  } elseif ( $n >= 0xe0 ) {
 587                      $c = substr( $string, $i, 3 );
 588                      $i += 2;
 589                  } elseif ( $n >= 0xc0 ) {
 590                      $c = substr( $string, $i, 2 );
 591                      $i++;
 592                  }
 593                  if ( isset( self::$utfCombiningClass[$c] ) ) {
 594                      $lastClass = self::$utfCombiningClass[$c];
 595                      if ( isset( $combiners[$lastClass] ) ) {
 596                          $combiners[$lastClass] .= $c;
 597                      } else {
 598                          $combiners[$lastClass] = $c;
 599                      }
 600                      continue;
 601                  }
 602              }
 603              if ( $lastClass ) {
 604                  ksort( $combiners );
 605                  $out .= implode( '', $combiners );
 606                  $combiners = array();
 607              }
 608              $out .= $c;
 609              $lastClass = 0;
 610          }
 611          if ( $lastClass ) {
 612              ksort( $combiners );
 613              $out .= implode( '', $combiners );
 614          }
 615  
 616          return $out;
 617      }
 618  
 619      /**
 620       * Produces canonically composed sequences, i.e. normal form C or KC.
 621       *
 622       * @private
 623       * @param string $string a valid UTF-8 string in sorted normal form D or KD.
 624       *   Input is not validated.
 625       * @return string a UTF-8 string with canonical precomposed characters used
 626       *   where possible.
 627       */
 628  	static function fastCompose( $string ) {
 629          UtfNormal::loadData();
 630          $len = strlen( $string );
 631          $out = '';
 632          $lastClass = -1;
 633          $lastHangul = 0;
 634          $startChar = '';
 635          $combining = '';
 636          $x1 = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
 637          $x2 = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );
 638          for ( $i = 0; $i < $len; $i++ ) {
 639              $c = $string[$i];
 640              $n = ord( $c );
 641              if ( $n < 0x80 ) {
 642                  # No combining characters here...
 643                  $out .= $startChar;
 644                  $out .= $combining;
 645                  $startChar = $c;
 646                  $combining = '';
 647                  $lastClass = 0;
 648                  continue;
 649              } elseif ( $n >= 0xf0 ) {
 650                  $c = substr( $string, $i, 4 );
 651                  $i += 3;
 652              } elseif ( $n >= 0xe0 ) {
 653                  $c = substr( $string, $i, 3 );
 654                  $i += 2;
 655              } elseif ( $n >= 0xc0 ) {
 656                  $c = substr( $string, $i, 2 );
 657                  $i++;
 658              }
 659              $pair = $startChar . $c;
 660              if ( $n > 0x80 ) {
 661                  if ( isset( self::$utfCombiningClass[$c] ) ) {
 662                      # A combining char; see what we can do with it
 663                      $class = self::$utfCombiningClass[$c];
 664                      if ( !empty( $startChar ) &&
 665                          $lastClass < $class &&
 666                          $class > 0 &&
 667                          isset( self::$utfCanonicalComp[$pair] )
 668                      ) {
 669                          $startChar = self::$utfCanonicalComp[$pair];
 670                          $class = 0;
 671                      } else {
 672                          $combining .= $c;
 673                      }
 674                      $lastClass = $class;
 675                      $lastHangul = 0;
 676                      continue;
 677                  }
 678              }
 679              # New start char
 680              if ( $lastClass == 0 ) {
 681                  if ( isset( self::$utfCanonicalComp[$pair] ) ) {
 682                      $startChar = self::$utfCanonicalComp[$pair];
 683                      $lastHangul = 0;
 684                      continue;
 685                  }
 686                  if ( $n >= $x1 && $n <= $x2 ) {
 687                      # WARNING: Hangul code is painfully slow.
 688                      # I apologize for this ugly, ugly code; however
 689                      # performance is even more teh suck if we call
 690                      # out to nice clean functions. Lookup tables are
 691                      # marginally faster, but require a lot of space.
 692                      #
 693                      if ( $c >= UTF8_HANGUL_VBASE &&
 694                          $c <= UTF8_HANGUL_VEND &&
 695                          $startChar >= UTF8_HANGUL_LBASE &&
 696                          $startChar <= UTF8_HANGUL_LEND
 697                      ) {
 698                          #
 699                          #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
 700                          #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
 701                          $lIndex = ord( $startChar[2] ) - 0x80;
 702                          $vIndex = ord( $c[2] ) - 0xa1;
 703  
 704                          $hangulPoint = UNICODE_HANGUL_FIRST +
 705                              UNICODE_HANGUL_TCOUNT *
 706                              ( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );
 707  
 708                          # Hardcode the limited-range UTF-8 conversion:
 709                          $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
 710                              chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
 711                              chr( $hangulPoint & 0x3f | 0x80 );
 712                          $lastHangul = 0;
 713                          continue;
 714                      } elseif ( $c >= UTF8_HANGUL_TBASE &&
 715                          $c <= UTF8_HANGUL_TEND &&
 716                          $startChar >= UTF8_HANGUL_FIRST &&
 717                          $startChar <= UTF8_HANGUL_LAST &&
 718                          !$lastHangul
 719                      ) {
 720                          # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
 721                          $tIndex = ord( $c[2] ) - 0xa7;
 722                          if ( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
 723  
 724                          # Increment the code point by $tIndex, without
 725                          # the function overhead of decoding and recoding UTF-8
 726                          #
 727                          $tail = ord( $startChar[2] ) + $tIndex;
 728                          if ( $tail > 0xbf ) {
 729                              $tail -= 0x40;
 730                              $mid = ord( $startChar[1] ) + 1;
 731                              if ( $mid > 0xbf ) {
 732                                  $startChar[0] = chr( ord( $startChar[0] ) + 1 );
 733                                  $mid -= 0x40;
 734                              }
 735                              $startChar[1] = chr( $mid );
 736                          }
 737                          $startChar[2] = chr( $tail );
 738  
 739                          # If there's another jamo char after this, *don't* try to merge it.
 740                          $lastHangul = 1;
 741                          continue;
 742                      }
 743                  }
 744              }
 745              $out .= $startChar;
 746              $out .= $combining;
 747              $startChar = $c;
 748              $combining = '';
 749              $lastClass = 0;
 750              $lastHangul = 0;
 751          }
 752          $out .= $startChar . $combining;
 753  
 754          return $out;
 755      }
 756  
 757      /**
 758       * This is just used for the benchmark, comparing how long it takes to
 759       * interate through a string without really doing anything of substance.
 760       * @param $string string
 761       * @return string
 762       */
 763  	static function placebo( $string ) {
 764          $len = strlen( $string );
 765          $out = '';
 766          for ( $i = 0; $i < $len; $i++ ) {
 767              $out .= $string[$i];
 768          }
 769  
 770          return $out;
 771      }
 772  
 773      /**
 774       * Function to replace some characters that we don't want
 775       * but most of the native normalize functions keep.
 776       *
 777       * @param string $string The string
 778       * @return String String with the character codes replaced.
 779       */
 780  	private static function replaceForNativeNormalize( $string ) {
 781          $string = preg_replace(
 782              '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
 783              UTF8_REPLACEMENT,
 784              $string );
 785          $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
 786          $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
 787  
 788          return $string;
 789      }
 790  }
PHP Cross Reference of MediaWiki-1.24.0

/includes/normal/ -> UtfNormal.php (source)