php/html/UtfNormal_8php_source.html

00001 <?php
# Feature flags recording which native normalization back-ends exist in
# this PHP build. Both are evaluated once, when this file is loaded.

# True when a global utf8_normalize() function is available.
define(
        'NORMALIZE_ICU',
        function_exists( 'utf8_normalize' )
);

# True when the intl extension's normalizer_normalize() is available.
define(
        'NORMALIZE_INTL',
        function_exists( 'normalizer_normalize' )
);
00033
00048 class UtfNormal {
00052         const UNORM_NONE = 1;
00053         const UNORM_NFD  = 2;
00054         const UNORM_NFKD = 3;
00055         const UNORM_NFC  = 4;
00056         const UNORM_NFKC = 5;
00057         const UNORM_FCD  = 6;
00058         const UNORM_DEFAULT = self::UNORM_NFC;
00059
00060         static $utfCombiningClass = null;
00061         static $utfCanonicalComp = null;
00062         static $utfCanonicalDecomp = null;
00063
00064         # Load compatibility decompositions on demand if they are needed.
00065         static $utfCompatibilityDecomp = null;
00066
00067         static $utfCheckNFC;
00068
00079         static function cleanUp( $string ) {
00080                 if( NORMALIZE_ICU ) {
00081                         $string = self::replaceForNativeNormalize( $string );
00082
00083                         # UnicodeString constructor fails if the string ends with a
00084                         # head byte. Add a junk char at the end, we'll strip it off.
00085                         return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
00086                 } elseif( NORMALIZE_INTL ) {
00087                         $string = self::replaceForNativeNormalize( $string );
00088                         $norm = normalizer_normalize( $string, Normalizer::FORM_C );
00089                         if( $norm === null || $norm === false ) {
00090                                 # normalizer_normalize will either return false or null
00091                                 # (depending on which doc you read) if invalid utf8 string.
00092                                 # quickIsNFCVerify cleans up invalid sequences.
00093
00094                                 if( UtfNormal::quickIsNFCVerify( $string ) ) {
00095                                         # if that's true, the string is actually already normal.
00096                                         return $string;
00097                                 } else {
00098                                         # Now we are valid but non-normal
00099                                         return normalizer_normalize( $string, Normalizer::FORM_C );
00100                                 }
00101                         } else {
00102                                 return $norm;
00103                         }
00104                 } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
00105                         # Side effect -- $string has had UTF-8 errors cleaned up.
00106                         return $string;
00107                 } else {
00108                         return UtfNormal::NFC( $string );
00109                 }
00110         }
00111
00120         static function toNFC( $string ) {
00121                 if( NORMALIZE_INTL )
00122                         return normalizer_normalize( $string, Normalizer::FORM_C );
00123                 elseif( NORMALIZE_ICU )
00124                         return utf8_normalize( $string, self::UNORM_NFC );
00125                 elseif( UtfNormal::quickIsNFC( $string ) )
00126                         return $string;
00127                 else
00128                         return UtfNormal::NFC( $string );
00129         }
00130
00138         static function toNFD( $string ) {
00139                 if( NORMALIZE_INTL )
00140                         return normalizer_normalize( $string, Normalizer::FORM_D );
00141                 elseif( NORMALIZE_ICU )
00142                         return utf8_normalize( $string, self::UNORM_NFD );
00143                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
00144                         return UtfNormal::NFD( $string );
00145                 else
00146                         return $string;
00147         }
00148
00157         static function toNFKC( $string ) {
00158                 if( NORMALIZE_INTL )
00159                         return normalizer_normalize( $string, Normalizer::FORM_KC );
00160                 elseif( NORMALIZE_ICU )
00161                         return utf8_normalize( $string, self::UNORM_NFKC );
00162                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
00163                         return UtfNormal::NFKC( $string );
00164                 else
00165                         return $string;
00166         }
00167
00176         static function toNFKD( $string ) {
00177                 if( NORMALIZE_INTL )
00178                         return normalizer_normalize( $string, Normalizer::FORM_KD );
00179                 elseif( NORMALIZE_ICU )
00180                         return utf8_normalize( $string, self::UNORM_NFKD );
00181                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
00182                         return UtfNormal::NFKD( $string );
00183                 else
00184                         return $string;
00185         }
00186
00191         static function loadData() {
00192                 if( !isset( self::$utfCombiningClass ) ) {
00193                         require_once( __DIR__ . '/UtfNormalData.inc' );
00194                 }
00195         }
00196
00203         static function quickIsNFC( $string ) {
00204                 # ASCII is always valid NFC!
00205                 # If it's pure ASCII, let it through.
00206                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
00207
00208                 UtfNormal::loadData();
00209                 $len = strlen( $string );
00210                 for( $i = 0; $i < $len; $i++ ) {
00211                         $c = $string[$i];
00212                         $n = ord( $c );
00213                         if( $n < 0x80 ) {
00214                                 continue;
00215                         } elseif( $n >= 0xf0 ) {
00216                                 $c = substr( $string, $i, 4 );
00217                                 $i += 3;
00218                         } elseif( $n >= 0xe0 ) {
00219                                 $c = substr( $string, $i, 3 );
00220                                 $i += 2;
00221                         } elseif( $n >= 0xc0 ) {
00222                                 $c = substr( $string, $i, 2 );
00223                                 $i++;
00224                         }
00225                         if( isset( self::$utfCheckNFC[$c] ) ) {
00226                                 # If it's NO or MAYBE, bail and do the slow check.
00227                                 return false;
00228                         }
00229                         if( isset( self::$utfCombiningClass[$c] ) ) {
00230                                 # Combining character? We might have to do sorting, at least.
00231                                 return false;
00232                         }
00233                 }
00234                 return true;
00235         }
00236
00243         static function quickIsNFCVerify( &$string ) {
00244                 # Screen out some characters that eg won't be allowed in XML
00245                 $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
00246
00247                 # ASCII is always valid NFC!
00248                 # If we're only ever given plain ASCII, we can avoid the overhead
00249                 # of initializing the decomposition tables by skipping out early.
00250                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
00251
00252                 static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
00253                 if( !isset( $checkit ) ) {
00254                         # Load/build some scary lookup tables...
00255                         UtfNormal::loadData();
00256
00257                         $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
00258
00259                         # Head bytes for sequences which we should do further validity checks
00260                         $checkit = array_flip( array_map( 'chr',
00261                                         array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
00262                                                    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
00263                                                    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
00264
00265                         # Each UTF-8 head byte is followed by a certain
00266                         # number of tail bytes.
00267                         $tailBytes = array();
00268                         for( $n = 0; $n < 256; $n++ ) {
00269                                 if( $n < 0xc0 ) {
00270                                         $remaining = 0;
00271                                 } elseif( $n < 0xe0 ) {
00272                                         $remaining = 1;
00273                                 } elseif( $n < 0xf0 ) {
00274                                         $remaining = 2;
00275                                 } elseif( $n < 0xf8 ) {
00276                                         $remaining = 3;
00277                                 } elseif( $n < 0xfc ) {
00278                                         $remaining = 4;
00279                                 } elseif( $n < 0xfe ) {
00280                                         $remaining = 5;
00281                                 } else {
00282                                         $remaining = 0;
00283                                 }
00284                                 $tailBytes[chr($n)] = $remaining;
00285                         }
00286                 }
00287
00288                 # Chop the text into pure-ASCII and non-ASCII areas;
00289                 # large ASCII parts can be handled much more quickly.
00290                 # Don't chop up Unicode areas for punctuation, though,
00291                 # that wastes energy.
00292                 $matches = array();
00293                 preg_match_all(
00294                         '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
00295                         $string, $matches );
00296
00297                 $looksNormal = true;
00298                 $base = 0;
00299                 $replace = array();
00300                 foreach( $matches[1] as $str ) {
00301                         $chunk = strlen( $str );
00302
00303                         if( $str[0] < "\x80" ) {
00304                                 # ASCII chunk: guaranteed to be valid UTF-8
00305                                 # and in normal form C, so skip over it.
00306                                 $base += $chunk;
00307                                 continue;
00308                         }
00309
00310                         # We'll have to examine the chunk byte by byte to ensure
00311                         # that it consists of valid UTF-8 sequences, and to see
00312                         # if any of them might not be normalized.
00313                         #
00314                         # Since PHP is not the fastest language on earth, some of
00315                         # this code is a little ugly with inner loop optimizations.
00316
00317                         $head = '';
00318                         $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
00319
00320                         for( $i = -1; --$len; ) {
00321                                 $remaining = $tailBytes[$c = $str[++$i]];
00322                                 if( $remaining ) {
00323                                         # UTF-8 head byte!
00324                                         $sequence = $head = $c;
00325                                         do {
00326                                                 # Look for the defined number of tail bytes...
00327                                                 if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
00328                                                         # Legal tail bytes are nice.
00329                                                         $sequence .= $c;
00330                                                 } else {
00331                                                         if( 0 == $len ) {
00332                                                                 # Premature end of string!
00333                                                                 # Drop a replacement character into output to
00334                                                                 # represent the invalid UTF-8 sequence.
00335                                                                 $replace[] = array( UTF8_REPLACEMENT,
00336                                                                                                         $base + $i + 1 - strlen( $sequence ),
00337                                                                                                         strlen( $sequence ) );
00338                                                                 break 2;
00339                                                         } else {
00340                                                                 # Illegal tail byte; abandon the sequence.
00341                                                                 $replace[] = array( UTF8_REPLACEMENT,
00342                                                                                                         $base + $i - strlen( $sequence ),
00343                                                                                                         strlen( $sequence ) );
00344                                                                 # Back up and reprocess this byte; it may itself
00345                                                                 # be a legal ASCII or UTF-8 sequence head.
00346                                                                 --$i;
00347                                                                 ++$len;
00348                                                                 continue 2;
00349                                                         }
00350                                                 }
00351                                         } while( --$remaining );
00352
00353                                         if( isset( $checkit[$head] ) ) {
00354                                                 # Do some more detailed validity checks, for
00355                                                 # invalid characters and illegal sequences.
00356                                                 if( $head == "\xed" ) {
00357                                                         # 0xed is relatively frequent in Korean, which
00358                                                         # abuts the surrogate area, so we're doing
00359                                                         # this check separately to speed things up.
00360
00361                                                         if( $sequence >= UTF8_SURROGATE_FIRST ) {
00362                                                                 # Surrogates are legal only in UTF-16 code.
00363                                                                 # They are totally forbidden here in UTF-8
00364                                                                 # utopia.
00365                                                                 $replace[] = array( UTF8_REPLACEMENT,
00366                                                                              $base + $i + 1 - strlen( $sequence ),
00367                                                                              strlen( $sequence ) );
00368                                                                 $head = '';
00369                                                                 continue;
00370                                                         }
00371                                                 } else {
00372                                                         # Slower, but rarer checks...
00373                                                         $n = ord( $head );
00374                                                         if(
00375                                                                 # "Overlong sequences" are those that are syntactically
00376                                                                 # correct but use more UTF-8 bytes than are necessary to
00377                                                                 # encode a character. Naïve string comparisons can be
00378                                                                 # tricked into failing to see a match for an ASCII
00379                                                                 # character, for instance, which can be a security hole
00380                                                                 # if blacklist checks are being used.
00381                                                                ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
00382                                                                 || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
00383                                                                 || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
00384
00385                                                                 # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
00386                                                                 || ($n == 0xef &&
00387                                                                            ($sequence == UTF8_FFFE)
00388                                                                         || ($sequence == UTF8_FFFF) )
00389
00390                                                                 # Unicode has been limited to 21 bits; longer
00391                                                                 # sequences are not allowed.
00392                                                                 || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
00393
00394                                                                 $replace[] = array( UTF8_REPLACEMENT,
00395                                                                                     $base + $i + 1 - strlen( $sequence ),
00396                                                                                     strlen( $sequence ) );
00397                                                                 $head = '';
00398                                                                 continue;
00399                                                         }
00400                                                 }
00401                                         }
00402
00403                                         if( isset( $utfCheckOrCombining[$sequence] ) ) {
00404                                                 # If it's NO or MAYBE, we'll have to rip
00405                                                 # the string apart and put it back together.
00406                                                 # That's going to be mighty slow.
00407                                                 $looksNormal = false;
00408                                         }
00409
00410                                         # The sequence is legal!
00411                                         $head = '';
00412                                 } elseif( $c < "\x80" ) {
00413                                         # ASCII byte.
00414                                         $head = '';
00415                                 } elseif( $c < "\xc0" ) {
00416                                         # Illegal tail bytes
00417                                         if( $head == '' ) {
00418                                                 # Out of the blue!
00419                                                 $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
00420                                         } else {
00421                                                 # Don't add if we're continuing a broken sequence;
00422                                                 # we already put a replacement character when we looked
00423                                                 # at the broken sequence.
00424                                                 $replace[] = array( '', $base + $i, 1 );
00425                                         }
00426                                 } else {
00427                                         # Miscellaneous freaks.
00428                                         $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
00429                                         $head = '';
00430                                 }
00431                         }
00432                         $base += $chunk;
00433                 }
00434                 if( count( $replace ) ) {
00435                         # There were illegal UTF-8 sequences we need to fix up.
00436                         $out = '';
00437                         $last = 0;
00438                         foreach( $replace as $rep ) {
00439                                 list( $replacement, $start, $length ) = $rep;
00440                                 if( $last < $start ) {
00441                                         $out .= substr( $string, $last, $start - $last );
00442                                 }
00443                                 $out .= $replacement;
00444                                 $last = $start + $length;
00445                         }
00446                         if( $last < strlen( $string ) ) {
00447                                 $out .= substr( $string, $last );
00448                         }
00449                         $string = $out;
00450                 }
00451                 return $looksNormal;
00452         }
00453
00454         # These take a string and run the normalization on them, without
00455         # checking for validity or any optimization etc. Input must be
00456         # VALID UTF-8!
00457
00462         static function NFC( $string ) {
00463                 return UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
00464         }
00465
00471         static function NFD( $string ) {
00472                 UtfNormal::loadData();
00473
00474                 return UtfNormal::fastCombiningSort(
00475                         UtfNormal::fastDecompose( $string, self::$utfCanonicalDecomp ) );
00476         }
00477
00483         static function NFKC( $string ) {
00484                 return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
00485         }
00486
00492         static function NFKD( $string ) {
00493                 if( !isset( self::$utfCompatibilityDecomp ) ) {
00494                         require_once( 'UtfNormalDataK.inc' );
00495                 }
00496                 return self::fastCombiningSort(
00497                         self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
00498         }
00499
00500
00510         static function fastDecompose( $string, $map ) {
00511                 UtfNormal::loadData();
00512                 $len = strlen( $string );
00513                 $out = '';
00514                 for( $i = 0; $i < $len; $i++ ) {
00515                         $c = $string[$i];
00516                         $n = ord( $c );
00517                         if( $n < 0x80 ) {
00518                                 # ASCII chars never decompose
00519                                 # THEY ARE IMMORTAL
00520                                 $out .= $c;
00521                                 continue;
00522                         } elseif( $n >= 0xf0 ) {
00523                                 $c = substr( $string, $i, 4 );
00524                                 $i += 3;
00525                         } elseif( $n >= 0xe0 ) {
00526                                 $c = substr( $string, $i, 3 );
00527                                 $i += 2;
00528                         } elseif( $n >= 0xc0 ) {
00529                                 $c = substr( $string, $i, 2 );
00530                                 $i++;
00531                         }
00532                         if( isset( $map[$c] ) ) {
00533                                 $out .= $map[$c];
00534                                 continue;
00535                         } else {
00536                                 if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
00537                                         # Decompose a hangul syllable into jamo;
00538                                         # hardcoded for three-byte UTF-8 sequence.
00539                                         # A lookup table would be slightly faster,
00540                                         # but adds a lot of memory & disk needs.
00541                                         #
00542                                         $index = ( (ord( $c[0] ) & 0x0f) << 12
00543                                                  | (ord( $c[1] ) & 0x3f) <<  6
00544                                                  | (ord( $c[2] ) & 0x3f) )
00545                                                - UNICODE_HANGUL_FIRST;
00546                                         $l = intval( $index / UNICODE_HANGUL_NCOUNT );
00547                                         $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
00548                                         $t = $index % UNICODE_HANGUL_TCOUNT;
00549                                         $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
00550                                         if( $t >= 25 ) {
00551                                                 $out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
00552                                         } elseif( $t ) {
00553                                                 $out .= "\xe1\x86" . chr( 0xa7 + $t );
00554                                         }
00555                                         continue;
00556                                 }
00557                         }
00558                         $out .= $c;
00559                 }
00560                 return $out;
00561         }
00562
00570         static function fastCombiningSort( $string ) {
00571                 UtfNormal::loadData();
00572                 $len = strlen( $string );
00573                 $out = '';
00574                 $combiners = array();
00575                 $lastClass = -1;
00576                 for( $i = 0; $i < $len; $i++ ) {
00577                         $c = $string[$i];
00578                         $n = ord( $c );
00579                         if( $n >= 0x80 ) {
00580                                 if( $n >= 0xf0 ) {
00581                                         $c = substr( $string, $i, 4 );
00582                                         $i += 3;
00583                                 } elseif( $n >= 0xe0 ) {
00584                                         $c = substr( $string, $i, 3 );
00585                                         $i += 2;
00586                                 } elseif( $n >= 0xc0 ) {
00587                                         $c = substr( $string, $i, 2 );
00588                                         $i++;
00589                                 }
00590                                 if( isset( self::$utfCombiningClass[$c] ) ) {
00591                                         $lastClass = self::$utfCombiningClass[$c];
00592                                         if( isset( $combiners[$lastClass] ) ) {
00593                                                 $combiners[$lastClass] .= $c;
00594                                         } else {
00595                                                 $combiners[$lastClass] = $c;
00596                                         }
00597                                         continue;
00598                                 }
00599                         }
00600                         if( $lastClass ) {
00601                                 ksort( $combiners );
00602                                 $out .= implode( '', $combiners );
00603                                 $combiners = array();
00604                         }
00605                         $out .= $c;
00606                         $lastClass = 0;
00607                 }
00608                 if( $lastClass ) {
00609                         ksort( $combiners );
00610                         $out .= implode( '', $combiners );
00611                 }
00612                 return $out;
00613         }
00614
00622         static function fastCompose( $string ) {
00623                 UtfNormal::loadData();
00624                 $len = strlen( $string );
00625                 $out = '';
00626                 $lastClass = -1;
00627                 $lastHangul = 0;
00628                 $startChar = '';
00629                 $combining = '';
00630                 $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
00631                 $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
00632                 for( $i = 0; $i < $len; $i++ ) {
00633                         $c = $string[$i];
00634                         $n = ord( $c );
00635                         if( $n < 0x80 ) {
00636                                 # No combining characters here...
00637                                 $out .= $startChar;
00638                                 $out .= $combining;
00639                                 $startChar = $c;
00640                                 $combining = '';
00641                                 $lastClass = 0;
00642                                 continue;
00643                         } elseif( $n >= 0xf0 ) {
00644                                 $c = substr( $string, $i, 4 );
00645                                 $i += 3;
00646                         } elseif( $n >= 0xe0 ) {
00647                                 $c = substr( $string, $i, 3 );
00648                                 $i += 2;
00649                         } elseif( $n >= 0xc0 ) {
00650                                 $c = substr( $string, $i, 2 );
00651                                 $i++;
00652                         }
00653                         $pair = $startChar . $c;
00654                         if( $n > 0x80 ) {
00655                                 if( isset( self::$utfCombiningClass[$c] ) ) {
00656                                         # A combining char; see what we can do with it
00657                                         $class = self::$utfCombiningClass[$c];
00658                                         if( !empty( $startChar ) &&
00659                                                 $lastClass < $class &&
00660                                                 $class > 0 &&
00661                                                 isset( self::$utfCanonicalComp[$pair] ) ) {
00662                                                 $startChar = self::$utfCanonicalComp[$pair];
00663                                                 $class = 0;
00664                                         } else {
00665                                                 $combining .= $c;
00666                                         }
00667                                         $lastClass = $class;
00668                                         $lastHangul = 0;
00669                                         continue;
00670                                 }
00671                         }
00672                         # New start char
00673                         if( $lastClass == 0 ) {
00674                                 if( isset( self::$utfCanonicalComp[$pair] ) ) {
00675                                         $startChar = self::$utfCanonicalComp[$pair];
00676                                         $lastHangul = 0;
00677                                         continue;
00678                                 }
00679                                 if( $n >= $x1 && $n <= $x2 ) {
00680                                         # WARNING: Hangul code is painfully slow.
00681                                         # I apologize for this ugly, ugly code; however
00682                                         # performance is even more teh suck if we call
00683                                         # out to nice clean functions. Lookup tables are
00684                                         # marginally faster, but require a lot of space.
00685                                         #
00686                                         if( $c >= UTF8_HANGUL_VBASE &&
00687                                                 $c <= UTF8_HANGUL_VEND &&
00688                                                 $startChar >= UTF8_HANGUL_LBASE &&
00689                                                 $startChar <= UTF8_HANGUL_LEND ) {
00690                                                 #
00691                                                 #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
00692                                                 #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
00693                                                 $lIndex = ord( $startChar[2] ) - 0x80;
00694                                                 $vIndex = ord( $c[2]         ) - 0xa1;
00695
00696                                                 $hangulPoint = UNICODE_HANGUL_FIRST +
00697                                                         UNICODE_HANGUL_TCOUNT *
00698                                                         (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
00699
00700                                                 # Hardcode the limited-range UTF-8 conversion:
00701                                                 $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
00702                                                                          chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
00703                                                                          chr( $hangulPoint       & 0x3f | 0x80 );
00704                                                 $lastHangul = 0;
00705                                                 continue;
00706                                         } elseif( $c >= UTF8_HANGUL_TBASE &&
00707                                                           $c <= UTF8_HANGUL_TEND &&
00708                                                           $startChar >= UTF8_HANGUL_FIRST &&
00709                                                           $startChar <= UTF8_HANGUL_LAST &&
00710                                                           !$lastHangul ) {
00711                                                 # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
00712                                                 $tIndex = ord( $c[2] ) - 0xa7;
00713                                                 if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);
00714
00715                                                 # Increment the code point by $tIndex, without
00716                                                 # the function overhead of decoding and recoding UTF-8
00717                                                 #
00718                                                 $tail = ord( $startChar[2] ) + $tIndex;
00719                                                 if( $tail > 0xbf ) {
00720                                                         $tail -= 0x40;
00721                                                         $mid = ord( $startChar[1] ) + 1;
00722                                                         if( $mid > 0xbf ) {
00723                                                                 $startChar[0] = chr( ord( $startChar[0] ) + 1 );
00724                                                                 $mid -= 0x40;
00725                                                         }
00726                                                         $startChar[1] = chr( $mid );
00727                                                 }
00728                                                 $startChar[2] = chr( $tail );
00729
00730                                                 # If there's another jamo char after this, *don't* try to merge it.
00731                                                 $lastHangul = 1;
00732                                                 continue;
00733                                         }
00734                                 }
00735                         }
00736                         $out .= $startChar;
00737                         $out .= $combining;
00738                         $startChar = $c;
00739                         $combining = '';
00740                         $lastClass = 0;
00741                         $lastHangul = 0;
00742                 }
00743                 $out .= $startChar . $combining;
00744                 return $out;
00745         }
00746
00753         static function placebo( $string ) {
00754                 $len = strlen( $string );
00755                 $out = '';
00756                 for( $i = 0; $i < $len; $i++ ) {
00757                         $out .= $string[$i];
00758                 }
00759                 return $out;
00760         }
00768         private static function replaceForNativeNormalize( $string ) {
00769                 $string = preg_replace(
00770                         '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
00771                         UTF8_REPLACEMENT,
00772                         $string );
00773                 $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
00774                 $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
00775                 return $string;
00776         }
00777 }