MediaWiki  REL1_19
UtfNormal.php
Go to the documentation of this file.
00001 <?php
00031 define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) ); # true when utf8_normalize() exists; gates the utf8_normalize() calls in UtfNormal
00032 define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) ); # true when normalizer_normalize() exists; gates the normalizer_normalize() calls in UtfNormal
00033 
00048 class UtfNormal {
00052         const UNORM_NONE = 1;
00053         const UNORM_NFD  = 2;
00054         const UNORM_NFKD = 3;
00055         const UNORM_NFC  = 4;
00056         const UNORM_NFKC = 5;
00057         const UNORM_FCD  = 6;
00058         const UNORM_DEFAULT = self::UNORM_NFC;
00059 
00060         static $utfCombiningClass = null;
00061         static $utfCanonicalComp = null;
00062         static $utfCanonicalDecomp = null;
00063 
00064         # Load compatibility decompositions on demand if they are needed.
00065         static $utfCompatibilityDecomp = null;
00066 
00067         static $utfCheckNFC;
00068 
/**
 * Repair invalid UTF-8 in a string and bring it into normalization form C.
 *
 * The backend is chosen from the NORMALIZE_ICU / NORMALIZE_INTL flags; when
 * neither is set the pure-PHP routines of this class do the work.
 *
 * @param $string String: text that may contain invalid UTF-8
 * @return String: the result produced by the selected backend
 */
static function cleanUp( $string ) {
    if ( NORMALIZE_ICU ) {
        # A trailing head byte makes the UnicodeString constructor fail, so
        # append a throwaway \x01 and trim it from the result again.
        $padded = self::replaceForNativeNormalize( $string ) . "\x01";
        return rtrim( utf8_normalize( $padded, self::UNORM_NFC ), "\x01" );
    }

    if ( NORMALIZE_INTL ) {
        $string = self::replaceForNativeNormalize( $string );
        $normalized = normalizer_normalize( $string, Normalizer::FORM_C );
        if ( $normalized !== null && $normalized !== false ) {
            return $normalized;
        }

        # null or false (the docs disagree on which) signals invalid UTF-8.
        # quickIsNFCVerify() repairs $string in place and reports whether
        # the repaired text already looks normalized.
        if ( UtfNormal::quickIsNFCVerify( $string ) ) {
            return $string;
        }

        # Valid now, but still not normalized: give it back to intl.
        return normalizer_normalize( $string, Normalizer::FORM_C );
    }

    # Pure-PHP path. As a side effect of the check, $string has had its
    # UTF-8 errors cleaned up.
    if ( UtfNormal::quickIsNFCVerify( $string ) ) {
        return $string;
    }
    return UtfNormal::NFC( $string );
}
00111 
/**
 * Convert a string to normalization form C.
 *
 * No UTF-8 validation or repair happens here; the string goes straight to
 * whichever backend is available.
 *
 * @param $string String: UTF-8 text
 * @return String: the backend's NFC result
 */
static function toNFC( $string ) {
    if ( NORMALIZE_INTL ) {
        return normalizer_normalize( $string, Normalizer::FORM_C );
    }
    if ( NORMALIZE_ICU ) {
        return utf8_normalize( $string, self::UNORM_NFC );
    }

    # Pure PHP: skip the expensive path when the quick check already passes.
    return UtfNormal::quickIsNFC( $string )
        ? $string
        : UtfNormal::NFC( $string );
}
00130 
/**
 * Convert a string to normalization form D.
 *
 * No UTF-8 validation or repair happens here.
 *
 * @param $string String: UTF-8 text
 * @return String: the backend's NFD result
 */
static function toNFD( $string ) {
    if ( NORMALIZE_INTL ) {
        return normalizer_normalize( $string, Normalizer::FORM_D );
    }
    if ( NORMALIZE_ICU ) {
        return utf8_normalize( $string, self::UNORM_NFD );
    }

    # Pure PHP: a string without any high byte is returned untouched.
    $hasHighBytes = preg_match( '/[\x80-\xff]/', $string );
    return $hasHighBytes ? UtfNormal::NFD( $string ) : $string;
}
00148 
/**
 * Convert a string to normalization form KC.
 *
 * No UTF-8 validation or repair happens here.
 *
 * @param $string String: UTF-8 text
 * @return String: the backend's NFKC result
 */
static function toNFKC( $string ) {
    if ( NORMALIZE_INTL ) {
        return normalizer_normalize( $string, Normalizer::FORM_KC );
    }
    if ( NORMALIZE_ICU ) {
        return utf8_normalize( $string, self::UNORM_NFKC );
    }

    # Pure PHP: a string without any high byte is returned untouched.
    $hasHighBytes = preg_match( '/[\x80-\xff]/', $string );
    return $hasHighBytes ? UtfNormal::NFKC( $string ) : $string;
}
00167 
/**
 * Convert a string to normalization form KD.
 *
 * No UTF-8 validation or repair happens here.
 *
 * @param $string String: UTF-8 text
 * @return String: the backend's NFKD result
 */
static function toNFKD( $string ) {
    if ( NORMALIZE_INTL ) {
        return normalizer_normalize( $string, Normalizer::FORM_KD );
    }
    if ( NORMALIZE_ICU ) {
        return utf8_normalize( $string, self::UNORM_NFKD );
    }

    # Pure PHP: a string without any high byte is returned untouched.
    $hasHighBytes = preg_match( '/[\x80-\xff]/', $string );
    return $hasHighBytes ? UtfNormal::NFKD( $string ) : $string;
}
00186 
/**
 * Pull in the normalization data file unless it is already available.
 *
 * self::$utfCombiningClass being set is taken as the sign that the data
 * is loaded (presumably the include fills the static tables).
 */
static function loadData() {
    if ( isset( self::$utfCombiningClass ) ) {
        return;
    }
    require_once( dirname(__FILE__) . '/UtfNormalData.inc' );
}
00196 
/**
 * Cheap test for "definitely already in NFC".
 *
 * Walks the string one UTF-8 sequence at a time and gives up as soon as a
 * sequence is listed in the quick-check table or the combining-class table.
 *
 * @param $string String: UTF-8 text (not validated here)
 * @return Boolean: true if nothing suspicious was found, false if the slow
 *   check is needed
 */
static function quickIsNFC( $string ) {
    # Pure ASCII is always NFC; no tables needed.
    if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
        return true;
    }

    UtfNormal::loadData();
    $length = strlen( $string );
    $pos = 0;
    while ( $pos < $length ) {
        $byte = ord( $string[$pos] );
        if ( $byte < 0x80 ) {
            $pos++;
            continue;
        }

        # Sequence width is taken from the leading byte alone.
        if ( $byte >= 0xf0 ) {
            $width = 4;
        } elseif ( $byte >= 0xe0 ) {
            $width = 3;
        } elseif ( $byte >= 0xc0 ) {
            $width = 2;
        } else {
            $width = 1;
        }
        $char = substr( $string, $pos, $width );
        $pos += $width;

        # Quick-check answer NO or MAYBE: the slow check has to decide.
        if ( isset( self::$utfCheckNFC[$char] ) ) {
            return false;
        }
        # A combining character may at least need reordering.
        if ( isset( self::$utfCombiningClass[$char] ) ) {
            return false;
        }
    }
    return true;
}
00236 
00242         static function quickIsNFCVerify( &$string ) {
                      # Validates $string as UTF-8 and, in the same pass, quick-checks
                      # whether it is already in NFC.
                      #
                      # $string is taken by reference and rewritten in place: the control
                      # characters matched below and every invalid or forbidden UTF-8
                      # sequence found by the scan are substituted through the $replace list.
                      #
                      # Returns true when the text is pure ASCII or when no sequence was found
                      # in the quick-check / combining-class tables; false means the caller
                      # still has to run the full normalization.
00243                 # Screen out some characters that eg won't be allowed in XML
00244                 $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
00245 
00246                 # ASCII is always valid NFC!
00247                 # If we're only ever given plain ASCII, we can avoid the overhead
00248                 # of initializing the decomposition tables by skipping out early.
00249                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
00250 
                      # Lookup tables, built on first use and kept in function statics:
                      #   $checkit             - head bytes (as keys) that need the extra checks below
                      #   $tailBytes           - byte => number of tail bytes expected after it
                      #   $utfCheckOrCombining - merged quick-check and combining-class tables
00251                 static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
00252                 if( !isset( $checkit ) ) {
00253                         # Load/build some scary lookup tables...
00254                         UtfNormal::loadData();
00255 
00256                         $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
00257 
00258                         # Head bytes for sequences which we should do further validity checks
00259                         $checkit = array_flip( array_map( 'chr',
00260                                         array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
00261                                                    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
00262                                                    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
00263 
00264                         # Each UTF-8 head byte is followed by a certain
00265                         # number of tail bytes.
                              # Bytes below 0xc0 and the bytes 0xfe/0xff get 0, so they are
                              # never treated as head bytes by the scan.
00266                         $tailBytes = array();
00267                         for( $n = 0; $n < 256; $n++ ) {
00268                                 if( $n < 0xc0 ) {
00269                                         $remaining = 0;
00270                                 } elseif( $n < 0xe0 ) {
00271                                         $remaining = 1;
00272                                 } elseif( $n < 0xf0 ) {
00273                                         $remaining = 2;
00274                                 } elseif( $n < 0xf8 ) {
00275                                         $remaining = 3;
00276                                 } elseif( $n < 0xfc ) {
00277                                         $remaining = 4;
00278                                 } elseif( $n < 0xfe ) {
00279                                         $remaining = 5;
00280                                 } else {
00281                                         $remaining = 0;
00282                                 }
00283                                 $tailBytes[chr($n)] = $remaining;
00284                         }
00285                 }
00286 
00287                 # Chop the text into pure-ASCII and non-ASCII areas;
00288                 # large ASCII parts can be handled much more quickly.
00289                 # Don't chop up Unicode areas for punctuation, though,
00290                 # that wastes energy.
                      # (The second alternative starts at a high byte and then runs on
                      # through digits, whitespace and most ASCII punctuation; it only
                      # stops at A-Z, backtick or a-z.)
00291                 $matches = array();
00292                 preg_match_all(
00293                         '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
00294                         $string, $matches );
00295 
                      # $base is the byte offset of the current chunk within $string.
                      # $replace collects array( replacement, offset, length ) fix-ups in
                      # scan order; they are applied after the scan.
00296                 $looksNormal = true;
00297                 $base = 0;
00298                 $replace = array();
00299                 foreach( $matches[1] as $str ) {
00300                         $chunk = strlen( $str );
00301 
00302                         if( $str[0] < "\x80" ) {
00303                                 # ASCII chunk: guaranteed to be valid UTF-8
00304                                 # and in normal form C, so skip over it.
00305                                 $base += $chunk;
00306                                 continue;
00307                         }
00308 
00309                         # We'll have to examine the chunk byte by byte to ensure
00310                         # that it consists of valid UTF-8 sequences, and to see
00311                         # if any of them might not be normalized.
00312                         #
00313                         # Since PHP is not the fastest language on earth, some of
00314                         # this code is a little ugly with inner loop optimizations.
00315 
00316                         $head = '';
00317                         $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
00318 
                              # $len is pre-decremented once per byte consumed and the loop ends
                              # when it reaches 0; $i is the index of the byte just read.
00319                         for( $i = -1; --$len; ) {
00320                                 $remaining = $tailBytes[$c = $str[++$i]];
00321                                 if( $remaining ) {
00322                                         # UTF-8 head byte!
00323                                         $sequence = $head = $c;
00324                                         do {
00325                                                 # Look for the defined number of tail bytes...
00326                                                 if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
00327                                                         # Legal tail bytes are nice.
00328                                                         $sequence .= $c;
00329                                                 } else {
00330                                                         if( 0 == $len ) {
00331                                                                 # Premature end of string!
00332                                                                 # Drop a replacement character into output to
00333                                                                 # represent the invalid UTF-8 sequence.
00334                                                                 $replace[] = array( UTF8_REPLACEMENT,
00335                                                                                                         $base + $i + 1 - strlen( $sequence ),
00336                                                                                                         strlen( $sequence ) );
                                                                      # Leave both the tail-byte loop and the per-byte loop.
00337                                                                 break 2;
00338                                                         } else {
00339                                                                 # Illegal tail byte; abandon the sequence.
00340                                                                 $replace[] = array( UTF8_REPLACEMENT,
00341                                                                                                         $base + $i - strlen( $sequence ),
00342                                                                                                         strlen( $sequence ) );
00343                                                                 # Back up and reprocess this byte; it may itself
00344                                                                 # be a legal ASCII or UTF-8 sequence head.
00345                                                                 --$i;
00346                                                                 ++$len;
00347                                                                 continue 2;
00348                                                         }
00349                                                 }
00350                                         } while( --$remaining );
00351 
00352                                         if( isset( $checkit[$head] ) ) {
00353                                                 # Do some more detailed validity checks, for
00354                                                 # invalid characters and illegal sequences.
00355                                                 if( $head == "\xed" ) {
00356                                                         # 0xed is relatively frequent in Korean, which
00357                                                         # abuts the surrogate area, so we're doing
00358                                                         # this check separately to speed things up.
00359 
00360                                                         if( $sequence >= UTF8_SURROGATE_FIRST ) {
00361                                                                 # Surrogates are legal only in UTF-16 code.
00362                                                                 # They are totally forbidden here in UTF-8
00363                                                                 # utopia.
00364                                                                 $replace[] = array( UTF8_REPLACEMENT,
00365                                                                              $base + $i + 1 - strlen( $sequence ),
00366                                                                              strlen( $sequence ) );
00367                                                                 $head = '';
00368                                                                 continue;
00369                                                         }
00370                                                 } else {
00371                                                         # Slower, but rarer checks...
00372                                                         $n = ord( $head );
00373                                                         if(
00374                                                                 # "Overlong sequences" are those that are syntactically
00375                                                                 # correct but use more UTF-8 bytes than are necessary to
00376                                                                 # encode a character. Naïve string comparisons can be
00377                                                                 # tricked into failing to see a match for an ASCII
00378                                                                 # character, for instance, which can be a security hole
00379                                                                 # if blacklist checks are being used.
00380                                                                ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
00381                                                                 || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
00382                                                                 || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
00383 
00384                                                                 # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
                                                                      # NOTE(review): && binds tighter than ||, so the next three
                                                                      # lines group as ($n == 0xef && seq == FFFE) || (seq == FFFF),
                                                                      # not as the indentation suggests. Presumably harmless because
                                                                      # a sequence equal to UTF8_FFFF would itself start with 0xef;
                                                                      # confirm against the constant's definition.
00385                                                                 || ($n == 0xef &&
00386                                                                            ($sequence == UTF8_FFFE)
00387                                                                         || ($sequence == UTF8_FFFF) )
00388 
00389                                                                 # Unicode has been limited to 21 bits; longer
00390                                                                 # sequences are not allowed.
00391                                                                 || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
00392 
00393                                                                 $replace[] = array( UTF8_REPLACEMENT,
00394                                                                                     $base + $i + 1 - strlen( $sequence ),
00395                                                                                     strlen( $sequence ) );
00396                                                                 $head = '';
00397                                                                 continue;
00398                                                         }
00399                                                 }
00400                                         }
00401 
00402                                         if( isset( $utfCheckOrCombining[$sequence] ) ) {
00403                                                 # If it's NO or MAYBE, we'll have to rip
00404                                                 # the string apart and put it back together.
00405                                                 # That's going to be mighty slow.
00406                                                 $looksNormal = false;
00407                                         }
00408 
00409                                         # The sequence is legal!
00410                                         $head = '';
00411                                 } elseif( $c < "\x80" ) {
00412                                         # ASCII byte.
00413                                         $head = '';
00414                                 } elseif( $c < "\xc0" ) {
00415                                         # Illegal tail bytes
00416                                         if( $head == '' ) {
00417                                                 # Out of the blue!
00418                                                 $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
00419                                         } else {
00420                                                 # Don't add if we're continuing a broken sequence;
00421                                                 # we already put a replacement character when we looked
00422                                                 # at the broken sequence.
00423                                                 $replace[] = array( '', $base + $i, 1 );
00424                                         }
00425                                 } else {
00426                                         # Miscellaneous freaks.
                                              # (Reached for bytes >= 0xc0 whose $tailBytes entry is 0,
                                              # i.e. 0xfe and 0xff.)
00427                                         $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
00428                                         $head = '';
00429                                 }
00430                         }
00431                         $base += $chunk;
00432                 }
00433                 if( count( $replace ) ) {
00434                         # There were illegal UTF-8 sequences we need to fix up.
                              # Rebuild the string: copy the untouched stretch before each
                              # fix-up, emit the replacement, then skip the bad bytes.
00435                         $out = '';
00436                         $last = 0;
00437                         foreach( $replace as $rep ) {
00438                                 list( $replacement, $start, $length ) = $rep;
00439                                 if( $last < $start ) {
00440                                         $out .= substr( $string, $last, $start - $last );
00441                                 }
00442                                 $out .= $replacement;
00443                                 $last = $start + $length;
00444                         }
00445                         if( $last < strlen( $string ) ) {
00446                                 $out .= substr( $string, $last );
00447                         }
00448                         $string = $out;
00449                 }
00450                 return $looksNormal;
00451         }
00452 
00453         # These methods take a string and run the normalization on it, without
00454         # checking for validity or applying any optimization. Input must be
00455         # VALID UTF-8!
00456 
/**
 * Normalize to form C: canonical decomposition followed by composition.
 *
 * @param $string String: valid UTF-8
 * @return String: the recomposed string
 */
static function NFC( $string ) {
    $decomposed = UtfNormal::NFD( $string );
    return UtfNormal::fastCompose( $decomposed );
}
00464 
/**
 * Normalize to form D: apply the canonical decomposition table, then sort
 * the combining characters.
 *
 * @param $string String: valid UTF-8
 * @return String: the decomposed, reordered string
 */
static function NFD( $string ) {
    # The table is read right below, so it has to be loaded first.
    UtfNormal::loadData();

    $decomposed = UtfNormal::fastDecompose( $string, self::$utfCanonicalDecomp );
    return UtfNormal::fastCombiningSort( $decomposed );
}
00476 
/**
 * Normalize to form KC: compatibility decomposition followed by composition.
 *
 * @param $string String: valid UTF-8
 * @return String: the recomposed string
 */
static function NFKC( $string ) {
    $decomposed = UtfNormal::NFKD( $string );
    return UtfNormal::fastCompose( $decomposed );
}
00485 
/**
 * Normalize to form KD: apply the compatibility decomposition table, then
 * sort the combining characters.
 *
 * The compatibility table is loaded on demand, the first time it is found
 * unset.
 *
 * @param $string String: valid UTF-8
 * @return String: the decomposed, reordered string
 */
static function NFKD( $string ) {
    if ( !isset( self::$utfCompatibilityDecomp ) ) {
        # Anchor the include to this file's directory, the same way
        # loadData() does, instead of relying on include_path or the
        # current working directory to find the data file.
        require_once( dirname(__FILE__) . '/UtfNormalDataK.inc' );
    }
    return self::fastCombiningSort(
        self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
}
00498 
00499 
/**
 * Replace every character that has an entry in $map by its decomposition.
 *
 * Characters without an entry are copied through, except Hangul syllables,
 * which are split into jamo arithmetically. No ordering of combining
 * characters is done here.
 *
 * @param $string String: valid UTF-8
 * @param $map Array: UTF-8 character => replacement string
 * @return String: the decomposed string
 */
static function fastDecompose( $string, $map ) {
    UtfNormal::loadData();
    $length = strlen( $string );
    $result = '';
    $pos = 0;
    while ( $pos < $length ) {
        $char = $string[$pos];
        $byte = ord( $char );
        if ( $byte < 0x80 ) {
            # ASCII never decomposes.
            $result .= $char;
            $pos++;
            continue;
        }

        # Sequence width is taken from the leading byte alone.
        if ( $byte >= 0xf0 ) {
            $width = 4;
        } elseif ( $byte >= 0xe0 ) {
            $width = 3;
        } elseif ( $byte >= 0xc0 ) {
            $width = 2;
        } else {
            $width = 1;
        }
        $char = substr( $string, $pos, $width );
        $pos += $width;

        if ( isset( $map[$char] ) ) {
            $result .= $map[$char];
            continue;
        }

        if ( $char < UTF8_HANGUL_FIRST || $char > UTF8_HANGUL_LAST ) {
            # Nothing to decompose; copy the character as it is.
            $result .= $char;
            continue;
        }

        # Hangul syllable: compute the jamo instead of using a table, which
        # would be slightly faster but cost a lot of memory and disk space.
        # The bit twiddling is hardcoded for a three-byte UTF-8 sequence.
        $codepoint = ( ord( $char[0] ) & 0x0f ) << 12
            | ( ord( $char[1] ) & 0x3f ) << 6
            | ( ord( $char[2] ) & 0x3f );
        $index = $codepoint - UNICODE_HANGUL_FIRST;
        $lead = intval( $index / UNICODE_HANGUL_NCOUNT );
        $vowel = intval( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
        $trail = $index % UNICODE_HANGUL_TCOUNT;

        $result .= "\xe1\x84" . chr( 0x80 + $lead ) . "\xe1\x85" . chr( 0xa1 + $vowel );
        if ( $trail >= 25 ) {
            # 0xa7 + 25 would leave the tail-byte range, so these jamo
            # continue under the next prefix.
            $result .= "\xe1\x87" . chr( 0x80 + $trail - 25 );
        } elseif ( $trail ) {
            $result .= "\xe1\x86" . chr( 0xa7 + $trail );
        }
    }
    return $result;
}
00561 
/**
 * Reorder runs of combining characters by their combining class.
 *
 * Combining characters are buffered per class until the next
 * non-combining character shows up; the buffer is then emitted in
 * ascending class order, keeping the input order inside each class.
 *
 * @param $string String: valid UTF-8, already decomposed
 * @return String: the reordered string
 */
static function fastCombiningSort( $string ) {
    UtfNormal::loadData();
    $length = strlen( $string );
    $sorted = '';
    $pending = array();
    $lastClass = -1;
    $pos = 0;
    while ( $pos < $length ) {
        $byte = ord( $string[$pos] );

        # Sequence width is taken from the leading byte alone.
        if ( $byte >= 0xf0 ) {
            $width = 4;
        } elseif ( $byte >= 0xe0 ) {
            $width = 3;
        } elseif ( $byte >= 0xc0 ) {
            $width = 2;
        } else {
            $width = 1;
        }
        $char = substr( $string, $pos, $width );
        $pos += $width;

        # Only non-ASCII characters are looked up in the class table.
        if ( $byte >= 0x80 && isset( self::$utfCombiningClass[$char] ) ) {
            $lastClass = self::$utfCombiningClass[$char];
            $soFar = isset( $pending[$lastClass] ) ? $pending[$lastClass] : '';
            $pending[$lastClass] = $soFar . $char;
            continue;
        }

        # A non-combining character ends the current run: flush the buffer.
        if ( $lastClass ) {
            ksort( $pending );
            $sorted .= implode( '', $pending );
            $pending = array();
        }
        $sorted .= $char;
        $lastClass = 0;
    }

    # Flush whatever is still buffered at the end of the string.
    if ( $lastClass ) {
        ksort( $pending );
        $sorted .= implode( '', $pending );
    }
    return $sorted;
}
00613 
00621         static function fastCompose( $string ) {
00622                 UtfNormal::loadData();
00623                 $len = strlen( $string );
00624                 $out = '';
00625                 $lastClass = -1;
00626                 $lastHangul = 0;
00627                 $startChar = '';
00628                 $combining = '';
00629                 $x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
00630                 $x2 = ord(substr(UTF8_HANGUL_TEND,0,1));
00631                 for( $i = 0; $i < $len; $i++ ) {
00632                         $c = $string[$i];
00633                         $n = ord( $c );
00634                         if( $n < 0x80 ) {
00635                                 # No combining characters here...
00636                                 $out .= $startChar;
00637                                 $out .= $combining;
00638                                 $startChar = $c;
00639                                 $combining = '';
00640                                 $lastClass = 0;
00641                                 continue;
00642                         } elseif( $n >= 0xf0 ) {
00643                                 $c = substr( $string, $i, 4 );
00644                                 $i += 3;
00645                         } elseif( $n >= 0xe0 ) {
00646                                 $c = substr( $string, $i, 3 );
00647                                 $i += 2;
00648                         } elseif( $n >= 0xc0 ) {
00649                                 $c = substr( $string, $i, 2 );
00650                                 $i++;
00651                         }
00652                         $pair = $startChar . $c;
00653                         if( $n > 0x80 ) {
00654                                 if( isset( self::$utfCombiningClass[$c] ) ) {
00655                                         # A combining char; see what we can do with it
00656                                         $class = self::$utfCombiningClass[$c];
00657                                         if( !empty( $startChar ) &&
00658                                                 $lastClass < $class &&
00659                                                 $class > 0 &&
00660                                                 isset( self::$utfCanonicalComp[$pair] ) ) {
00661                                                 $startChar = self::$utfCanonicalComp[$pair];
00662                                                 $class = 0;
00663                                         } else {
00664                                                 $combining .= $c;
00665                                         }
00666                                         $lastClass = $class;
00667                                         $lastHangul = 0;
00668                                         continue;
00669                                 }
00670                         }
00671                         # New start char
00672                         if( $lastClass == 0 ) {
00673                                 if( isset( self::$utfCanonicalComp[$pair] ) ) {
00674                                         $startChar = self::$utfCanonicalComp[$pair];
00675                                         $lastHangul = 0;
00676                                         continue;
00677                                 }
00678                                 if( $n >= $x1 && $n <= $x2 ) {
00679                                         # WARNING: Hangul code is painfully slow.
00680                                         # I apologize for this ugly, ugly code; however
00681                                         # performance is even more teh suck if we call
00682                                         # out to nice clean functions. Lookup tables are
00683                                         # marginally faster, but require a lot of space.
00684                                         #
00685                                         if( $c >= UTF8_HANGUL_VBASE &&
00686                                                 $c <= UTF8_HANGUL_VEND &&
00687                                                 $startChar >= UTF8_HANGUL_LBASE &&
00688                                                 $startChar <= UTF8_HANGUL_LEND ) {
00689                                                 #
00690                                                 #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
00691                                                 #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
00692                                                 $lIndex = ord( $startChar[2] ) - 0x80;
00693                                                 $vIndex = ord( $c[2]         ) - 0xa1;
00694 
00695                                                 $hangulPoint = UNICODE_HANGUL_FIRST +
00696                                                         UNICODE_HANGUL_TCOUNT *
00697                                                         (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
00698 
00699                                                 # Hardcode the limited-range UTF-8 conversion:
00700                                                 $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
00701                                                                          chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
00702                                                                          chr( $hangulPoint       & 0x3f | 0x80 );
00703                                                 $lastHangul = 0;
00704                                                 continue;
00705                                         } elseif( $c >= UTF8_HANGUL_TBASE &&
00706                                                           $c <= UTF8_HANGUL_TEND &&
00707                                                           $startChar >= UTF8_HANGUL_FIRST &&
00708                                                           $startChar <= UTF8_HANGUL_LAST &&
00709                                                           !$lastHangul ) {
00710                                                 # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
00711                                                 $tIndex = ord( $c[2] ) - 0xa7;
00712                                                 if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);
00713 
00714                                                 # Increment the code point by $tIndex, without
00715                                                 # the function overhead of decoding and recoding UTF-8
00716                                                 #
00717                                                 $tail = ord( $startChar[2] ) + $tIndex;
00718                                                 if( $tail > 0xbf ) {
00719                                                         $tail -= 0x40;
00720                                                         $mid = ord( $startChar[1] ) + 1;
00721                                                         if( $mid > 0xbf ) {
00722                                                                 $startChar[0] = chr( ord( $startChar[0] ) + 1 );
00723                                                                 $mid -= 0x40;
00724                                                         }
00725                                                         $startChar[1] = chr( $mid );
00726                                                 }
00727                                                 $startChar[2] = chr( $tail );
00728 
00729                                                 # If there's another jamo char after this, *don't* try to merge it.
00730                                                 $lastHangul = 1;
00731                                                 continue;
00732                                         }
00733                                 }
00734                         }
00735                         $out .= $startChar;
00736                         $out .= $combining;
00737                         $startChar = $c;
00738                         $combining = '';
00739                         $lastClass = 0;
00740                         $lastHangul = 0;
00741                 }
00742                 $out .= $startChar . $combining;
00743                 return $out;
00744         }
00745 
/**
 * Return a copy of the input, rebuilt by appending one byte at a time.
 *
 * The result is always equal to the input; the method performs no
 * normalization. NOTE(review): going by the name, this is presumably a
 * do-nothing baseline for timing comparisons -- confirm against callers.
 *
 * @param $string String: bytes to copy
 * @return String: byte-for-byte copy of $string
 */
static function placebo( $string ) {
        $copy = '';
        $total = strlen( $string );
        $pos = 0;
        while( $pos < $total ) {
                # Deliberately append a single byte per iteration.
                $copy .= $string[$pos];
                ++$pos;
        }
        return $copy;
}
/**
 * Substitute UTF8_REPLACEMENT for sequences that should not be handed to
 * the native normalizers (cleanUp() calls this before utf8_normalize()
 * and normalizer_normalize()).
 *
 * Two kinds of input are replaced:
 *  - every byte in 0x00-0x08, 0x0b, 0x0c and 0x0e-0x1f, i.e. the low
 *    control bytes other than tab (0x09), line feed (0x0a) and carriage
 *    return (0x0d);
 *  - every occurrence of UTF8_FFFE and UTF8_FFFF (presumably the UTF-8
 *    encodings of U+FFFE and U+FFFF; the constants are defined outside
 *    this file section -- confirm).
 *
 * @param $string String: text to sanitize
 * @return String: $string with the sequences above replaced
 */
private static function replaceForNativeNormalize( $string ) {
        # Byte-oriented pattern (no /u modifier): matches single control bytes.
        $withoutControls = preg_replace(
                '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
                UTF8_REPLACEMENT,
                $string
        );

        # Needles are processed in array order: first UTF8_FFFE, then UTF8_FFFF.
        return str_replace(
                array( UTF8_FFFE, UTF8_FFFF ),
                UTF8_REPLACEMENT,
                $withoutControls
        );
}
00776 }