MediaWiki  REL1_24
UtfNormal.php
Go to the documentation of this file.
00001 <?php
# True when a global utf8_normalize() function exists; UtfNormal::cleanUp()
# and the UtfNormal::toNF*() helpers then delegate to it.
00031 define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
# True when normalizer_normalize() exists; UtfNormal then calls it with the
# Normalizer::FORM_* constants.
00032 define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
00033 
00048 class UtfNormal {
00052     const UNORM_NONE = 1; # UNORM_* are normalization mode codes; this file passes the NFC/NFD/NFKC/NFKD ones to utf8_normalize().
00053     const UNORM_NFD = 2;
00054     const UNORM_NFKD = 3;
00055     const UNORM_NFC = 4;
00056     const UNORM_NFKC = 5;
00057     const UNORM_FCD = 6;
00058     const UNORM_DEFAULT = self::UNORM_NFC;
00059 
          # Lookup tables, all indexed by raw UTF-8 byte strings. loadData()
          # includes UtfNormalData.inc when $utfCombiningClass is unset;
          # presumably that file fills these in (TODO confirm against the .inc).
          #
          # $utfCombiningClass: character => combining class, used as the sort
          # key in fastCombiningSort().
00060     public static $utfCombiningClass = null;
          # $utfCanonicalComp: (start character . following character) => the
          # composed character, consulted by fastCompose().
00061     public static $utfCanonicalComp = null;
          # $utfCanonicalDecomp: character => replacement string, handed to
          # fastDecompose() by NFD().
00062     public static $utfCanonicalDecomp = null;
00063 
00064     # Load compatibility decompositions on demand if they are needed.
          # NFKD() includes UtfNormalDataK.inc while this is still unset.
00065     public static $utfCompatibilityDecomp = null;
          # $utfCheckNFC: a character being present as a key makes quickIsNFC()
          # and quickIsNFCVerify() report "not known to be normalized".
00066     public static $utfCheckNFC;
00067 
/**
 * Convert a string to normalization form C, repairing invalid UTF-8.
 *
 * Backend preference: utf8_normalize() when NORMALIZE_ICU is set, then
 * normalizer_normalize() when NORMALIZE_INTL is set, and finally the
 * pure-PHP code in this class.
 *
 * @param string $string Text that may contain invalid UTF-8
 * @return string Cleaned-up text in normalization form C
 */
static function cleanUp( $string ) {
    if ( NORMALIZE_ICU ) {
        $prepared = self::replaceForNativeNormalize( $string );

        # The UnicodeString constructor fails on input ending in a head
        # byte, so a sentinel byte is appended and trimmed off afterwards.
        $normalized = utf8_normalize( $prepared . "\x01", self::UNORM_NFC );

        return rtrim( $normalized, "\x01" );
    }

    if ( NORMALIZE_INTL ) {
        $prepared = self::replaceForNativeNormalize( $string );
        $normalized = normalizer_normalize( $prepared, Normalizer::FORM_C );
        if ( $normalized !== null && $normalized !== false ) {
            return $normalized;
        }

        # normalizer_normalize() reports invalid UTF-8 with false or null
        # (the docs disagree). quickIsNFCVerify() repairs $prepared in place.
        if ( self::quickIsNFCVerify( $prepared ) ) {
            # Repaired and already normal.
            return $prepared;
        }

        # Repaired, but still needs normalizing.
        return normalizer_normalize( $prepared, Normalizer::FORM_C );
    }

    # Pure-PHP path; quickIsNFCVerify() fixes UTF-8 errors in $string as a
    # side effect.
    if ( self::quickIsNFCVerify( $string ) ) {
        return $string;
    }

    return self::NFC( $string );
}
00110 
/**
 * Convert a string to normalization form C.
 *
 * No UTF-8 validation or repair happens here; use cleanUp() for input
 * that may be malformed.
 *
 * @param string $string Text to normalize
 * @return string Result of whichever backend is available
 */
static function toNFC( $string ) {
    if ( NORMALIZE_INTL ) {
        return normalizer_normalize( $string, Normalizer::FORM_C );
    }

    if ( NORMALIZE_ICU ) {
        return utf8_normalize( $string, self::UNORM_NFC );
    }

    # Pure-PHP fallback: skip the slow path when the quick check passes.
    return self::quickIsNFC( $string ) ? $string : self::NFC( $string );
}
00129 
/**
 * Convert a string to normalization form D.
 *
 * No UTF-8 validation or repair happens here.
 *
 * @param string $string Text to normalize
 * @return string Result of whichever backend is available
 */
static function toNFD( $string ) {
    if ( NORMALIZE_INTL ) {
        return normalizer_normalize( $string, Normalizer::FORM_D );
    }

    if ( NORMALIZE_ICU ) {
        return utf8_normalize( $string, self::UNORM_NFD );
    }

    # Pure-PHP fallback: text without any high byte is returned untouched.
    return preg_match( '/[\x80-\xff]/', $string ) ? self::NFD( $string ) : $string;
}
00147 
/**
 * Convert a string to normalization form KC.
 *
 * No UTF-8 validation or repair happens here.
 *
 * @param string $string Text to normalize
 * @return string Result of whichever backend is available
 */
static function toNFKC( $string ) {
    if ( NORMALIZE_INTL ) {
        return normalizer_normalize( $string, Normalizer::FORM_KC );
    }

    if ( NORMALIZE_ICU ) {
        return utf8_normalize( $string, self::UNORM_NFKC );
    }

    # Pure-PHP fallback: text without any high byte is returned untouched.
    return preg_match( '/[\x80-\xff]/', $string ) ? self::NFKC( $string ) : $string;
}
00166 
/**
 * Convert a string to normalization form KD.
 *
 * No UTF-8 validation or repair happens here.
 *
 * @param string $string Text to normalize
 * @return string Result of whichever backend is available
 */
static function toNFKD( $string ) {
    if ( NORMALIZE_INTL ) {
        return normalizer_normalize( $string, Normalizer::FORM_KD );
    }

    if ( NORMALIZE_ICU ) {
        return utf8_normalize( $string, self::UNORM_NFKD );
    }

    # Pure-PHP fallback: text without any high byte is returned untouched.
    return preg_match( '/[\x80-\xff]/', $string ) ? self::NFKD( $string ) : $string;
}
00185 
/**
 * Pull in the normalization lookup tables the first time they are needed.
 *
 * Includes UtfNormalData.inc from this file's directory unless
 * self::$utfCombiningClass is already set.
 */
static function loadData() {
    if ( isset( self::$utfCombiningClass ) ) {
        return;
    }

    require_once __DIR__ . '/UtfNormalData.inc';
}
00195 
/**
 * Cheap test for "this string is definitely already in form C".
 *
 * The width of each character is taken from its lead byte alone; the
 * bytes are not validated.
 *
 * @param string $string Text to inspect
 * @return bool True if the text is pure ASCII or contains no character
 *   listed in $utfCheckNFC or $utfCombiningClass; false means the slow
 *   normalization has to run
 */
static function quickIsNFC( $string ) {
    # Pure ASCII is always NFC and needs no lookup tables.
    if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
        return true;
    }

    UtfNormal::loadData();
    $total = strlen( $string );
    $pos = 0;
    while ( $pos < $total ) {
        $lead = ord( $string[$pos] );
        if ( $lead < 0x80 ) {
            $pos++;
            continue;
        }

        # Sequence length implied by the lead byte; bytes below 0xc0 are
        # looked up on their own.
        if ( $lead >= 0xf0 ) {
            $width = 4;
        } elseif ( $lead >= 0xe0 ) {
            $width = 3;
        } elseif ( $lead >= 0xc0 ) {
            $width = 2;
        } else {
            $width = 1;
        }
        $char = substr( $string, $pos, $width );
        $pos += $width;

        # A NO/MAYBE quick-check entry, or any combining character (which
        # may need reordering), forces the slow path.
        if ( isset( self::$utfCheckNFC[$char] ) || isset( self::$utfCombiningClass[$char] ) ) {
            return false;
        }
    }

    return true;
}
00236 
/**
 * Validate a string as UTF-8, repair it in place, and report whether it
 * already looks like normalization form C.
 *
 * Control characters matching [\x00-\x08\x0b\x0c\x0e-\x1f] and malformed
 * or forbidden sequences (truncated or broken sequences, stray tail bytes,
 * surrogates, overlong forms, U+FFFE/U+FFFF, anything above UTF8_MAX) are
 * substituted with UTF8_REPLACEMENT.
 *
 * @param string &$string Text to check; rewritten in place when something
 *   had to be repaired
 * @return bool False if a character listed in $utfCheckNFC or
 *   $utfCombiningClass was seen, so the caller must run the slow
 *   normalization; true otherwise
 */
static function quickIsNFCVerify( &$string ) {
    # Screen out some characters that eg won't be allowed in XML
    $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

    # ASCII is always valid NFC!
    # If we're only ever given plain ASCII, we can avoid the overhead
    # of initializing the decomposition tables by skipping out early.
    if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
        return true;
    }

    # Built once per request and kept in function-static variables.
    static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
    if ( !isset( $checkit ) ) {
        # Load/build some scary lookup tables...
        self::loadData();

        $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

        # Head bytes for sequences which we should do further validity checks
        $checkit = array_flip( array_map( 'chr',
            array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
                0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
                0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

        # Each UTF-8 head byte is followed by a certain
        # number of tail bytes.
        $tailBytes = array();
        for ( $n = 0; $n < 256; $n++ ) {
            if ( $n < 0xc0 ) {
                $remaining = 0;
            } elseif ( $n < 0xe0 ) {
                $remaining = 1;
            } elseif ( $n < 0xf0 ) {
                $remaining = 2;
            } elseif ( $n < 0xf8 ) {
                $remaining = 3;
            } elseif ( $n < 0xfc ) {
                $remaining = 4;
            } elseif ( $n < 0xfe ) {
                $remaining = 5;
            } else {
                $remaining = 0;
            }
            $tailBytes[chr( $n )] = $remaining;
        }
    }

    # Chop the text into pure-ASCII and non-ASCII areas;
    # large ASCII parts can be handled much more quickly.
    # Don't chop up Unicode areas for punctuation, though,
    # that wastes energy.
    $matches = array();
    preg_match_all(
        '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
        $string, $matches );

    $looksNormal = true;
    # Byte offset of the current chunk within $string.
    $base = 0;
    # List of array( replacement text, start offset, length ) fix-ups,
    # collected in ascending offset order and applied after the scan.
    $replace = array();
    foreach ( $matches[1] as $str ) {
        $chunk = strlen( $str );

        if ( $str[0] < "\x80" ) {
            # ASCII chunk: guaranteed to be valid UTF-8
            # and in normal form C, so skip over it.
            $base += $chunk;
            continue;
        }

        # We'll have to examine the chunk byte by byte to ensure
        # that it consists of valid UTF-8 sequences, and to see
        # if any of them might not be normalized.
        #
        # Since PHP is not the fastest language on earth, some of
        # this code is a little ugly with inner loop optimizations.

        $head = '';
        $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

        # $len counts the bytes still to read; $i is the index of the byte
        # read most recently.
        for ( $i = -1; --$len; ) {
            $remaining = $tailBytes[$c = $str[++$i]];
            if ( $remaining ) {
                # UTF-8 head byte!
                $sequence = $head = $c;
                do {
                    # Look for the defined number of tail bytes...
                    if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
                        # Legal tail bytes are nice.
                        $sequence .= $c;
                    } else {
                        if ( $len === 0 ) {
                            # Premature end of string!
                            # Drop a replacement character into output to
                            # represent the invalid UTF-8 sequence.
                            $replace[] = array( UTF8_REPLACEMENT,
                                $base + $i + 1 - strlen( $sequence ),
                                strlen( $sequence ) );
                            break 2;
                        } else {
                            # Illegal tail byte; abandon the sequence.
                            $replace[] = array( UTF8_REPLACEMENT,
                                $base + $i - strlen( $sequence ),
                                strlen( $sequence ) );
                            # Back up and reprocess this byte; it may itself
                            # be a legal ASCII or UTF-8 sequence head.
                            --$i;
                            ++$len;
                            continue 2;
                        }
                    }
                } while ( --$remaining );

                if ( isset( $checkit[$head] ) ) {
                    # Do some more detailed validity checks, for
                    # invalid characters and illegal sequences.
                    if ( $head == "\xed" ) {
                        # 0xed is relatively frequent in Korean, which
                        # abuts the surrogate area, so we're doing
                        # this check separately to speed things up.

                        if ( $sequence >= UTF8_SURROGATE_FIRST ) {
                            # Surrogates are legal only in UTF-16 code.
                            # They are totally forbidden here in UTF-8
                            # utopia.
                            $replace[] = array( UTF8_REPLACEMENT,
                                $base + $i + 1 - strlen( $sequence ),
                                strlen( $sequence ) );
                            $head = '';
                            continue;
                        }
                    } else {
                        # Slower, but rarer checks...
                        $n = ord( $head );
                        if (
                            # "Overlong sequences" are those that are syntactically
                            # correct but use more UTF-8 bytes than are necessary to
                            # encode a character. Naïve string comparisons can be
                            # tricked into failing to see a match for an ASCII
                            # character, for instance, which can be a security hole
                            # if blacklist checks are being used.
                            ( $n < 0xc2 && $sequence <= UTF8_OVERLONG_A )
                            || ( $n == 0xe0 && $sequence <= UTF8_OVERLONG_B )
                            || ( $n == 0xf0 && $sequence <= UTF8_OVERLONG_C )

                            # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
                            # Both alternatives are grouped under the 0xef test;
                            # && binds tighter than ||, so the grouping matters.
                            || ( $n == 0xef &&
                                ( $sequence == UTF8_FFFE || $sequence == UTF8_FFFF ) )

                            # Unicode has been limited to 21 bits; longer
                            # sequences are not allowed.
                            || ( $n >= 0xf0 && $sequence > UTF8_MAX )
                        ) {

                            $replace[] = array( UTF8_REPLACEMENT,
                                $base + $i + 1 - strlen( $sequence ),
                                strlen( $sequence ) );
                            $head = '';
                            continue;
                        }
                    }
                }

                if ( isset( $utfCheckOrCombining[$sequence] ) ) {
                    # If it's NO or MAYBE, we'll have to rip
                    # the string apart and put it back together.
                    # That's going to be mighty slow.
                    $looksNormal = false;
                }

                # The sequence is legal!
                $head = '';
            } elseif ( $c < "\x80" ) {
                # ASCII byte.
                $head = '';
            } elseif ( $c < "\xc0" ) {
                # Illegal tail bytes
                if ( $head == '' ) {
                    # Out of the blue!
                    $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                } else {
                    # Don't add if we're continuing a broken sequence;
                    # we already put a replacement character when we looked
                    # at the broken sequence.
                    $replace[] = array( '', $base + $i, 1 );
                }
            } else {
                # Miscellaneous freaks.
                $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                $head = '';
            }
        }
        $base += $chunk;
    }
    if ( count( $replace ) ) {
        # There were illegal UTF-8 sequences we need to fix up.
        # Rebuild the string by copying the untouched spans between fix-ups.
        $out = '';
        $last = 0;
        foreach ( $replace as $rep ) {
            list( $replacement, $start, $length ) = $rep;
            if ( $last < $start ) {
                $out .= substr( $string, $last, $start - $last );
            }
            $out .= $replacement;
            $last = $start + $length;
        }
        if ( $last < strlen( $string ) ) {
            $out .= substr( $string, $last );
        }
        $string = $out;
    }

    return $looksNormal;
}
00455 
00456     # These take a string and run the normalization on them, without
00457     # checking for validity or any optimization etc. Input must be
00458     # VALID UTF-8!
00459 
/**
 * Pure-PHP conversion to form C: canonical decomposition, then composition.
 *
 * Performs no validity checks; the input must be valid UTF-8.
 *
 * @param string $string Valid UTF-8 text
 * @return string Text in normalization form C
 */
static function NFC( $string ) {
    $decomposed = self::NFD( $string );

    return self::fastCompose( $decomposed );
}
00467 
/**
 * Pure-PHP conversion to form D: canonical decomposition followed by
 * sorting of combining characters.
 *
 * Performs no validity checks; the input must be valid UTF-8.
 *
 * @param string $string Valid UTF-8 text
 * @return string Text in normalization form D
 */
static function NFD( $string ) {
    # The tables must be loaded before $utfCanonicalDecomp is read.
    self::loadData();
    $decomposed = self::fastDecompose( $string, self::$utfCanonicalDecomp );

    return self::fastCombiningSort( $decomposed );
}
00479 
/**
 * Pure-PHP conversion to form KC: compatibility decomposition, then
 * composition.
 *
 * Performs no validity checks; the input must be valid UTF-8.
 *
 * @param string $string Valid UTF-8 text
 * @return string Text in normalization form KC
 */
static function NFKC( $string ) {
    $decomposed = self::NFKD( $string );

    return self::fastCompose( $decomposed );
}
00488 
/**
 * Pure-PHP conversion to form KD: compatibility decomposition followed by
 * sorting of combining characters.
 *
 * Performs no validity checks; the input must be valid UTF-8.
 *
 * @param string $string Valid UTF-8 text
 * @return string Text in normalization form KD
 */
static function NFKD( $string ) {
    if ( !isset( self::$utfCompatibilityDecomp ) ) {
        # Anchor on this file's directory, as loadData() does, so that
        # include_path or the working directory cannot select another file.
        require_once __DIR__ . '/UtfNormalDataK.inc';
    }

    return self::fastCombiningSort(
        self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
}
00502 
/**
 * Expand every character that has an entry in a decomposition map.
 *
 * Characters absent from the map are copied through unchanged, except
 * that anything between UTF8_HANGUL_FIRST and UTF8_HANGUL_LAST is split
 * into jamo arithmetically. The width of each character is taken from its
 * lead byte alone, so the input must be valid UTF-8.
 *
 * @param string $string Valid UTF-8 text
 * @param array $map Character => replacement string
 * @return string Decomposed text (combining characters not yet sorted)
 */
static function fastDecompose( $string, $map ) {
    UtfNormal::loadData();
    $total = strlen( $string );
    $result = '';
    $pos = 0;
    while ( $pos < $total ) {
        $byte = $string[$pos];
        $lead = ord( $byte );
        if ( $lead < 0x80 ) {
            # ASCII never decomposes.
            $result .= $byte;
            $pos++;
            continue;
        }

        # Sequence length implied by the lead byte.
        if ( $lead >= 0xf0 ) {
            $width = 4;
        } elseif ( $lead >= 0xe0 ) {
            $width = 3;
        } elseif ( $lead >= 0xc0 ) {
            $width = 2;
        } else {
            $width = 1;
        }
        $char = substr( $string, $pos, $width );
        $pos += $width;

        if ( isset( $map[$char] ) ) {
            $result .= $map[$char];
        } elseif ( $char >= UTF8_HANGUL_FIRST && $char <= UTF8_HANGUL_LAST ) {
            # Hangul syllable: decode the three-byte sequence by hand and
            # split it into jamo. A lookup table would be slightly faster
            # but costs a lot of memory and disk.
            $codepoint = ( ( ord( $char[0] ) & 0x0f ) << 12 )
                | ( ( ord( $char[1] ) & 0x3f ) << 6 )
                | ( ord( $char[2] ) & 0x3f );
            $index = $codepoint - UNICODE_HANGUL_FIRST;
            $lIndex = (int)( $index / UNICODE_HANGUL_NCOUNT );
            $vIndex = (int)( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
            $tIndex = $index % UNICODE_HANGUL_TCOUNT;

            $result .= "\xe1\x84" . chr( 0x80 + $lIndex ) . "\xe1\x85" . chr( 0xa1 + $vIndex );
            # Trailing jamo with index 25 and up use the next lead pair
            # (\xe1\x87); index 0 means there is no trailing jamo.
            if ( $tIndex >= 25 ) {
                $result .= "\xe1\x87" . chr( 0x80 + $tIndex - 25 );
            } elseif ( $tIndex ) {
                $result .= "\xe1\x86" . chr( 0xa7 + $tIndex );
            }
        } else {
            $result .= $char;
        }
    }

    return $result;
}
00565 
/**
 * Reorder each run of combining characters by combining class.
 *
 * Characters sharing a class keep their relative order, because they are
 * concatenated into one bucket per class before the buckets are sorted.
 * The width of each character is taken from its lead byte alone, so the
 * input must be valid UTF-8.
 *
 * @param string $string Valid, decomposed UTF-8 text
 * @return string Text with combining characters sorted
 */
static function fastCombiningSort( $string ) {
    UtfNormal::loadData();
    $total = strlen( $string );
    $sorted = '';
    # Combining class => concatenated characters of that class.
    $pending = array();
    $prevClass = -1;
    $pos = 0;
    while ( $pos < $total ) {
        $char = $string[$pos];
        $lead = ord( $char );
        if ( $lead < 0x80 ) {
            $pos++;
        } else {
            # Sequence length implied by the lead byte.
            if ( $lead >= 0xf0 ) {
                $width = 4;
            } elseif ( $lead >= 0xe0 ) {
                $width = 3;
            } elseif ( $lead >= 0xc0 ) {
                $width = 2;
            } else {
                $width = 1;
            }
            if ( $width > 1 ) {
                $char = substr( $string, $pos, $width );
            }
            $pos += $width;

            if ( isset( self::$utfCombiningClass[$char] ) ) {
                # Park the combiner in its class bucket until the run ends.
                $prevClass = self::$utfCombiningClass[$char];
                if ( !isset( $pending[$prevClass] ) ) {
                    $pending[$prevClass] = '';
                }
                $pending[$prevClass] .= $char;
                continue;
            }
        }

        # Not a combining character: flush whatever run came before it.
        if ( $prevClass ) {
            ksort( $pending );
            $sorted .= implode( '', $pending );
            $pending = array();
        }
        $sorted .= $char;
        $prevClass = 0;
    }

    # Flush a run that reaches the end of the string.
    if ( $prevClass ) {
        ksort( $pending );
        $sorted .= implode( '', $pending );
    }

    return $sorted;
}
00618 
/**
 * Compose characters using the canonical composition table, plus
 * arithmetic composition of Hangul jamo.
 *
 * NFC() and NFKC() call this on the output of NFD() and NFKD(). The width
 * of each character is taken from its lead byte alone, so the input must
 * be valid UTF-8.
 *
 * The scan keeps one pending start character ($startChar) and the
 * combining characters that could not be merged into it ($combining);
 * both are flushed to the output when the next start character arrives.
 *
 * @param string $string Valid, decomposed UTF-8 text
 * @return string Composed text
 */
static function fastCompose( $string ) {
    self::loadData();
    $len = strlen( $string );
    $out = '';
    # Combining class of the previous character: -1 before anything has
    # been read, 0 after a start character or a successful composition.
    $lastClass = -1;
    # Set once a trailing jamo has been merged, so no second one is merged.
    $lastHangul = 0;
    $startChar = '';
    $combining = '';
    # Lead bytes bounding the jamo range; a cheap pre-filter for the
    # Hangul code below.
    $x1 = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
    $x2 = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );
    for ( $i = 0; $i < $len; $i++ ) {
        $c = $string[$i];
        $n = ord( $c );
        if ( $n < 0x80 ) {
            # No combining characters here...
            $out .= $startChar;
            $out .= $combining;
            $startChar = $c;
            $combining = '';
            $lastClass = 0;
            continue;
        } elseif ( $n >= 0xf0 ) {
            $c = substr( $string, $i, 4 );
            $i += 3;
        } elseif ( $n >= 0xe0 ) {
            $c = substr( $string, $i, 3 );
            $i += 2;
        } elseif ( $n >= 0xc0 ) {
            $c = substr( $string, $i, 2 );
            $i++;
        }
        $pair = $startChar . $c;
        if ( $n > 0x80 ) {
            if ( isset( self::$utfCombiningClass[$c] ) ) {
                # A combining char; see what we can do with it
                $class = self::$utfCombiningClass[$c];
                # Compare against '' explicitly: empty() would also treat
                # the start character "0" as missing.
                if ( $startChar !== '' &&
                    $lastClass < $class &&
                    $class > 0 &&
                    isset( self::$utfCanonicalComp[$pair] )
                ) {
                    $startChar = self::$utfCanonicalComp[$pair];
                    $class = 0;
                } else {
                    $combining .= $c;
                }
                $lastClass = $class;
                $lastHangul = 0;
                continue;
            }
        }
        # New start char
        if ( $lastClass == 0 ) {
            if ( isset( self::$utfCanonicalComp[$pair] ) ) {
                $startChar = self::$utfCanonicalComp[$pair];
                $lastHangul = 0;
                continue;
            }
            if ( $n >= $x1 && $n <= $x2 ) {
                # WARNING: Hangul code is painfully slow.
                # I apologize for this ugly, ugly code; however
                # performance is even more teh suck if we call
                # out to nice clean functions. Lookup tables are
                # marginally faster, but require a lot of space.
                #
                if ( $c >= UTF8_HANGUL_VBASE &&
                    $c <= UTF8_HANGUL_VEND &&
                    $startChar >= UTF8_HANGUL_LBASE &&
                    $startChar <= UTF8_HANGUL_LEND
                ) {
                    # Leading jamo followed by a vowel jamo: merge the two
                    # into a single syllable.
                    #
                    #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                    #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                    $lIndex = ord( $startChar[2] ) - 0x80;
                    $vIndex = ord( $c[2] ) - 0xa1;

                    $hangulPoint = UNICODE_HANGUL_FIRST +
                        UNICODE_HANGUL_TCOUNT *
                        ( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

                    # Hardcode the limited-range UTF-8 conversion:
                    $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
                        chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
                        chr( $hangulPoint & 0x3f | 0x80 );
                    $lastHangul = 0;
                    continue;
                } elseif ( $c >= UTF8_HANGUL_TBASE &&
                    $c <= UTF8_HANGUL_TEND &&
                    $startChar >= UTF8_HANGUL_FIRST &&
                    $startChar <= UTF8_HANGUL_LAST &&
                    !$lastHangul
                ) {
                    # Existing syllable followed by a trailing jamo.
                    # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                    $tIndex = ord( $c[2] ) - 0xa7;
                    # A negative result means the jamo has a last byte
                    # below 0xa7, i.e. it sits in the next 64-codepoint
                    # block; rebase its index accordingly.
                    if ( $tIndex < 0 ) {
                        $tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
                    }

                    # Increment the code point by $tIndex, without
                    # the function overhead of decoding and recoding UTF-8
                    #
                    $tail = ord( $startChar[2] ) + $tIndex;
                    if ( $tail > 0xbf ) {
                        # Carry into the middle byte, and from there into
                        # the lead byte if that overflows as well.
                        $tail -= 0x40;
                        $mid = ord( $startChar[1] ) + 1;
                        if ( $mid > 0xbf ) {
                            $startChar[0] = chr( ord( $startChar[0] ) + 1 );
                            $mid -= 0x40;
                        }
                        $startChar[1] = chr( $mid );
                    }
                    $startChar[2] = chr( $tail );

                    # If there's another jamo char after this, *don't* try to merge it.
                    $lastHangul = 1;
                    continue;
                }
            }
        }
        # Nothing merged: flush the pending cluster and start a new one.
        $out .= $startChar;
        $out .= $combining;
        $startChar = $c;
        $combining = '';
        $lastClass = 0;
        $lastHangul = 0;
    }
    $out .= $startChar . $combining;

    return $out;
}
00756 
/**
 * Copy a string one byte at a time without changing it.
 *
 * @param string $string Any byte string
 * @return string Byte-for-byte copy of the input
 */
static function placebo( $string ) {
    $copy = '';
    $total = strlen( $string );
    $pos = 0;
    while ( $pos < $total ) {
        $copy .= $string[$pos];
        $pos++;
    }

    return $copy;
}
00772 
/**
 * Replace characters before the text is handed to utf8_normalize() or
 * normalizer_normalize().
 *
 * Control characters matching [\x00-\x08\x0b\x0c\x0e-\x1f], UTF8_FFFE and
 * UTF8_FFFF are each substituted with UTF8_REPLACEMENT.
 *
 * @param string $string Text to prepare
 * @return string Text with the substitutions applied
 */
private static function replaceForNativeNormalize( $string ) {
    $withoutControls = preg_replace(
        '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
        UTF8_REPLACEMENT,
        $string );

    # The array form applies the searches one after another, first
    # UTF8_FFFE and then UTF8_FFFF.
    return str_replace(
        array( UTF8_FFFE, UTF8_FFFF ),
        UTF8_REPLACEMENT,
        $withoutControls );
}
00790 }