php/html/UtfNormal_8php_source.html

00001 <?php
# Feature probes, evaluated once when this file is loaded.

# True when a utf8_normalize() function is defined.
define(
    'NORMALIZE_ICU',
    function_exists( 'utf8_normalize' )
);

# True when a normalizer_normalize() function is defined.
define(
    'NORMALIZE_INTL',
    function_exists( 'normalizer_normalize' )
);
00033
/**
 * Unicode normalization routines working on UTF-8 byte strings.
 *
 * Each public entry point (cleanUp, toNFC, toNFD, toNFKC, toNFKD) uses a
 * native normalizer when NORMALIZE_ICU / NORMALIZE_INTL report one, and
 * otherwise falls back to the pure-PHP implementation in this class, which
 * is driven by lookup tables loaded on demand from data files that sit
 * next to this file.
 */
class UtfNormal {
    # Mode identifiers handed to utf8_normalize().
    const UNORM_NONE = 1;
    const UNORM_NFD  = 2;
    const UNORM_NFKD = 3;
    const UNORM_NFC  = 4;
    const UNORM_NFKC = 5;
    const UNORM_FCD  = 6;
    const UNORM_DEFAULT = self::UNORM_NFC;

    # Lookup tables. They stay null until a data file has been included;
    # presumably UtfNormalData.inc assigns them (see loadData()).
    public static $utfCombiningClass = null;
    public static $utfCanonicalComp = null;
    public static $utfCanonicalDecomp = null;

    # Load compatibility decompositions on demand if they are needed.
    public static $utfCompatibilityDecomp = null;

    public static $utfCheckNFC;

    /**
     * Clean up a possibly invalid UTF-8 string and return it in
     * normalization form C.
     *
     * Control characters and invalid sequences are replaced with
     * UTF8_REPLACEMENT along the way.
     *
     * @param string $string Input bytes; need not be valid UTF-8.
     * @return string Cleaned-up string in form C.
     */
    public static function cleanUp( $string ) {
        if( NORMALIZE_ICU ) {
            $string = self::replaceForNativeNormalize( $string );

            # UnicodeString constructor fails if the string ends with a
            # head byte. Add a junk char at the end, we'll strip it off.
            return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
        } elseif( NORMALIZE_INTL ) {
            $string = self::replaceForNativeNormalize( $string );
            $norm = normalizer_normalize( $string, Normalizer::FORM_C );
            if( $norm === null || $norm === false ) {
                # normalizer_normalize will either return false or null
                # (depending on which doc you read) if invalid utf8 string.
                # quickIsNFCVerify cleans up invalid sequences.

                if( self::quickIsNFCVerify( $string ) ) {
                    # if that's true, the string is actually already normal.
                    return $string;
                } else {
                    # Now we are valid but non-normal
                    return normalizer_normalize( $string, Normalizer::FORM_C );
                }
            } else {
                return $norm;
            }
        } elseif( self::quickIsNFCVerify( $string ) ) {
            # Side effect -- $string has had UTF-8 errors cleaned up.
            return $string;
        } else {
            return self::NFC( $string );
        }
    }

    /**
     * Convert a string to normalization form C.
     * No validity clean-up is done here; use cleanUp() for untrusted input.
     *
     * @param string $string UTF-8 string.
     * @return string The string in form C (or whatever the native
     *  normalizer returns when one is used).
     */
    public static function toNFC( $string ) {
        if( NORMALIZE_INTL ) {
            return normalizer_normalize( $string, Normalizer::FORM_C );
        } elseif( NORMALIZE_ICU ) {
            return utf8_normalize( $string, self::UNORM_NFC );
        } elseif( self::quickIsNFC( $string ) ) {
            return $string;
        } else {
            return self::NFC( $string );
        }
    }

    /**
     * Convert a string to normalization form D.
     * Pure-ASCII input is returned unchanged on the PHP fallback path.
     *
     * @param string $string UTF-8 string.
     * @return string The string in form D.
     */
    public static function toNFD( $string ) {
        if( NORMALIZE_INTL ) {
            return normalizer_normalize( $string, Normalizer::FORM_D );
        } elseif( NORMALIZE_ICU ) {
            return utf8_normalize( $string, self::UNORM_NFD );
        } elseif( preg_match( '/[\x80-\xff]/', $string ) ) {
            return self::NFD( $string );
        } else {
            return $string;
        }
    }

    /**
     * Convert a string to normalization form KC.
     * Pure-ASCII input is returned unchanged on the PHP fallback path.
     *
     * @param string $string UTF-8 string.
     * @return string The string in form KC.
     */
    public static function toNFKC( $string ) {
        if( NORMALIZE_INTL ) {
            return normalizer_normalize( $string, Normalizer::FORM_KC );
        } elseif( NORMALIZE_ICU ) {
            return utf8_normalize( $string, self::UNORM_NFKC );
        } elseif( preg_match( '/[\x80-\xff]/', $string ) ) {
            return self::NFKC( $string );
        } else {
            return $string;
        }
    }

    /**
     * Convert a string to normalization form KD.
     * Pure-ASCII input is returned unchanged on the PHP fallback path.
     *
     * @param string $string UTF-8 string.
     * @return string The string in form KD.
     */
    public static function toNFKD( $string ) {
        if( NORMALIZE_INTL ) {
            return normalizer_normalize( $string, Normalizer::FORM_KD );
        } elseif( NORMALIZE_ICU ) {
            return utf8_normalize( $string, self::UNORM_NFKD );
        } elseif( preg_match( '/[\x80-\xff]/', $string ) ) {
            return self::NFKD( $string );
        } else {
            return $string;
        }
    }

    /**
     * Include UtfNormalData.inc from this file's directory, unless
     * $utfCombiningClass has already been set.
     */
    public static function loadData() {
        if( !isset( self::$utfCombiningClass ) ) {
            require_once __DIR__ . '/UtfNormalData.inc';
        }
    }

    /**
     * Quick scan: is the string certainly already in form C?
     *
     * Returns false as soon as a character listed in $utfCheckNFC or
     * $utfCombiningClass is seen, so a false result only means the slow
     * path is needed. The input is not validated.
     *
     * @param string $string UTF-8 string.
     * @return bool True if no suspicious character was found.
     */
    public static function quickIsNFC( $string ) {
        # ASCII is always valid NFC!
        # If it's pure ASCII, let it through.
        if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

        self::loadData();
        $len = strlen( $string );
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            if( isset( self::$utfCheckNFC[$c] ) ) {
                # If it's NO or MAYBE, bail and do the slow check.
                return false;
            }
            if( isset( self::$utfCombiningClass[$c] ) ) {
                # Combining character? We might have to do sorting, at least.
                return false;
            }
        }
        return true;
    }

    /**
     * Validate a string in place and report whether it looks normalized.
     *
     * The argument is modified: control characters other than tab, LF
     * and CR, and every invalid UTF-8 sequence (truncated, overlong,
     * surrogate, U+FFFE/U+FFFF, beyond UTF8_MAX, stray bytes) are replaced
     * with UTF8_REPLACEMENT.
     *
     * @param string &$string String to check; cleaned up by reference.
     * @return bool False if a character from $utfCheckNFC or
     *  $utfCombiningClass was seen (full normalization needed), else true.
     */
    public static function quickIsNFCVerify( &$string ) {
        # Screen out some characters that eg won't be allowed in XML
        $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

        # ASCII is always valid NFC!
        # If we're only ever given plain ASCII, we can avoid the overhead
        # of initializing the decomposition tables by skipping out early.
        if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

        static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
        if( !isset( $checkit ) ) {
            # Load/build some scary lookup tables...
            self::loadData();

            $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

            # Head bytes for sequences which we should do further validity checks
            $checkit = array_flip( array_map( 'chr',
                    array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
                           0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
                           0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

            # Each UTF-8 head byte is followed by a certain
            # number of tail bytes.
            $tailBytes = array();
            for( $n = 0; $n < 256; $n++ ) {
                if( $n < 0xc0 ) {
                    $remaining = 0;
                } elseif( $n < 0xe0 ) {
                    $remaining = 1;
                } elseif( $n < 0xf0 ) {
                    $remaining = 2;
                } elseif( $n < 0xf8 ) {
                    $remaining = 3;
                } elseif( $n < 0xfc ) {
                    $remaining = 4;
                } elseif( $n < 0xfe ) {
                    $remaining = 5;
                } else {
                    $remaining = 0;
                }
                $tailBytes[chr($n)] = $remaining;
            }
        }

        # Chop the text into pure-ASCII and non-ASCII areas;
        # large ASCII parts can be handled much more quickly.
        # Don't chop up Unicode areas for punctuation, though,
        # that wastes energy.
        $matches = array();
        preg_match_all(
            '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
            $string, $matches );

        $looksNormal = true;
        # Byte offset of the current chunk within $string.
        $base = 0;
        # Pending fix-ups, each array( replacement, start offset, length ).
        $replace = array();
        foreach( $matches[1] as $str ) {
            $chunk = strlen( $str );

            if( $str[0] < "\x80" ) {
                # ASCII chunk: guaranteed to be valid UTF-8
                # and in normal form C, so skip over it.
                $base += $chunk;
                continue;
            }

            # We'll have to examine the chunk byte by byte to ensure
            # that it consists of valid UTF-8 sequences, and to see
            # if any of them might not be normalized.
            #
            # Since PHP is not the fastest language on earth, some of
            # this code is a little ugly with inner loop optimizations.

            $head = '';
            $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

            for( $i = -1; --$len; ) {
                $remaining = $tailBytes[$c = $str[++$i]];
                if( $remaining ) {
                    # UTF-8 head byte!
                    $sequence = $head = $c;
                    do {
                        # Look for the defined number of tail bytes...
                        if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
                            # Legal tail bytes are nice.
                            $sequence .= $c;
                        } else {
                            if( 0 == $len ) {
                                # Premature end of string!
                                # Drop a replacement character into output to
                                # represent the invalid UTF-8 sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i + 1 - strlen( $sequence ),
                                                    strlen( $sequence ) );
                                break 2;
                            } else {
                                # Illegal tail byte; abandon the sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i - strlen( $sequence ),
                                                    strlen( $sequence ) );
                                # Back up and reprocess this byte; it may itself
                                # be a legal ASCII or UTF-8 sequence head.
                                --$i;
                                ++$len;
                                continue 2;
                            }
                        }
                    } while( --$remaining );

                    if( isset( $checkit[$head] ) ) {
                        # Do some more detailed validity checks, for
                        # invalid characters and illegal sequences.
                        if( $head == "\xed" ) {
                            # 0xed is relatively frequent in Korean, which
                            # abuts the surrogate area, so we're doing
                            # this check separately to speed things up.

                            if( $sequence >= UTF8_SURROGATE_FIRST ) {
                                # Surrogates are legal only in UTF-16 code.
                                # They are totally forbidden here in UTF-8
                                # utopia.
                                $replace[] = array( UTF8_REPLACEMENT,
                                             $base + $i + 1 - strlen( $sequence ),
                                             strlen( $sequence ) );
                                $head = '';
                                continue;
                            }
                        } else {
                            # Slower, but rarer checks...
                            $n = ord( $head );
                            if(
                                # "Overlong sequences" are those that are syntactically
                                # correct but use more UTF-8 bytes than are necessary to
                                # encode a character. Naïve string comparisons can be
                                # tricked into failing to see a match for an ASCII
                                # character, for instance, which can be a security hole
                                # if blacklist checks are being used.
                                   ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
                                || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
                                || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)

                                # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
                                # Both alternatives are grouped under the head-byte test.
                                || ($n == 0xef &&
                                       ($sequence == UTF8_FFFE
                                     || $sequence == UTF8_FFFF) )

                                # Unicode has been limited to 21 bits; longer
                                # sequences are not allowed.
                                || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {

                                $replace[] = array( UTF8_REPLACEMENT,
                                                    $base + $i + 1 - strlen( $sequence ),
                                                    strlen( $sequence ) );
                                $head = '';
                                continue;
                            }
                        }
                    }

                    if( isset( $utfCheckOrCombining[$sequence] ) ) {
                        # If it's NO or MAYBE, we'll have to rip
                        # the string apart and put it back together.
                        # That's going to be mighty slow.
                        $looksNormal = false;
                    }

                    # The sequence is legal!
                    $head = '';
                } elseif( $c < "\x80" ) {
                    # ASCII byte.
                    $head = '';
                } elseif( $c < "\xc0" ) {
                    # Illegal tail bytes
                    if( $head == '' ) {
                        # Out of the blue!
                        $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    } else {
                        # Don't add if we're continuing a broken sequence;
                        # we already put a replacement character when we looked
                        # at the broken sequence.
                        $replace[] = array( '', $base + $i, 1 );
                    }
                } else {
                    # Miscellaneous freaks.
                    $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    $head = '';
                }
            }
            $base += $chunk;
        }
        if( count( $replace ) ) {
            # There were illegal UTF-8 sequences we need to fix up.
            # Rebuild the string, copying the untouched spans between
            # the recorded replacements.
            $out = '';
            $last = 0;
            foreach( $replace as $rep ) {
                list( $replacement, $start, $length ) = $rep;
                if( $last < $start ) {
                    $out .= substr( $string, $last, $start - $last );
                }
                $out .= $replacement;
                $last = $start + $length;
            }
            if( $last < strlen( $string ) ) {
                $out .= substr( $string, $last );
            }
            $string = $out;
        }
        return $looksNormal;
    }

    # These take a string and run the normalization on them, without
    # checking for validity or any optimization etc. Input must be
    # VALID UTF-8!

    /**
     * Canonical decomposition followed by composition.
     *
     * @param string $string Valid UTF-8 string.
     * @return string Result of fastCompose() applied to NFD() output.
     */
    public static function NFC( $string ) {
        return self::fastCompose( self::NFD( $string ) );
    }

    /**
     * Canonical decomposition followed by combining-class sorting.
     *
     * @param string $string Valid UTF-8 string.
     * @return string Decomposed, sorted string.
     */
    public static function NFD( $string ) {
        self::loadData();

        return self::fastCombiningSort(
            self::fastDecompose( $string, self::$utfCanonicalDecomp ) );
    }

    /**
     * Compatibility decomposition followed by composition.
     *
     * @param string $string Valid UTF-8 string.
     * @return string Result of fastCompose() applied to NFKD() output.
     */
    public static function NFKC( $string ) {
        return self::fastCompose( self::NFKD( $string ) );
    }

    /**
     * Compatibility decomposition followed by combining-class sorting.
     *
     * The compatibility table is included on first use unless
     * $utfCompatibilityDecomp is already set.
     *
     * @param string $string Valid UTF-8 string.
     * @return string Decomposed, sorted string.
     */
    public static function NFKD( $string ) {
        if( !isset( self::$utfCompatibilityDecomp ) ) {
            # Anchor to this file's directory, as loadData() does, so the
            # lookup never goes through include_path.
            require_once __DIR__ . '/UtfNormalDataK.inc';
        }
        return self::fastCombiningSort(
            self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
    }


    /**
     * Replace every character that has an entry in $map with its mapped
     * string, and split Hangul syllables into jamo arithmetically.
     *
     * @param string $string Valid UTF-8 string.
     * @param array $map Decomposition table keyed by UTF-8 character.
     * @return string Decomposed string (combining marks not yet sorted).
     */
    public static function fastDecompose( $string, $map ) {
        self::loadData();
        $len = strlen( $string );
        $out = '';
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                # ASCII chars never decompose
                # THEY ARE IMMORTAL
                $out .= $c;
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            if( isset( $map[$c] ) ) {
                $out .= $map[$c];
                continue;
            } else {
                if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
                    # Decompose a hangul syllable into jamo;
                    # hardcoded for three-byte UTF-8 sequence.
                    # A lookup table would be slightly faster,
                    # but adds a lot of memory & disk needs.
                    #
                    $index = ( (ord( $c[0] ) & 0x0f) << 12
                             | (ord( $c[1] ) & 0x3f) <<  6
                             | (ord( $c[2] ) & 0x3f) )
                           - UNICODE_HANGUL_FIRST;
                    $l = intval( $index / UNICODE_HANGUL_NCOUNT );
                    $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
                    $t = $index % UNICODE_HANGUL_TCOUNT;
                    $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
                    # Trailing jamo straddle a UTF-8 second-byte boundary
                    # (0x86 -> 0x87), hence the two cases.
                    if( $t >= 25 ) {
                        $out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
                    } elseif( $t ) {
                        $out .= "\xe1\x86" . chr( 0xa7 + $t );
                    }
                    continue;
                }
            }
            $out .= $c;
        }
        return $out;
    }

    /**
     * Reorder each run of combining characters by ascending combining
     * class, as given by $utfCombiningClass.
     *
     * @param string $string Valid, already decomposed UTF-8 string.
     * @return string String with combining runs sorted.
     */
    public static function fastCombiningSort( $string ) {
        self::loadData();
        $len = strlen( $string );
        $out = '';
        # Combining characters of the current run, bucketed by class.
        $combiners = array();
        $lastClass = -1;
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n >= 0x80 ) {
                if( $n >= 0xf0 ) {
                    $c = substr( $string, $i, 4 );
                    $i += 3;
                } elseif( $n >= 0xe0 ) {
                    $c = substr( $string, $i, 3 );
                    $i += 2;
                } elseif( $n >= 0xc0 ) {
                    $c = substr( $string, $i, 2 );
                    $i++;
                }
                if( isset( self::$utfCombiningClass[$c] ) ) {
                    $lastClass = self::$utfCombiningClass[$c];
                    if( isset( $combiners[$lastClass] ) ) {
                        $combiners[$lastClass] .= $c;
                    } else {
                        $combiners[$lastClass] = $c;
                    }
                    continue;
                }
            }
            # Non-combining character: flush the pending run first.
            if( $lastClass ) {
                ksort( $combiners );
                $out .= implode( '', $combiners );
                $combiners = array();
            }
            $out .= $c;
            $lastClass = 0;
        }
        if( $lastClass ) {
            ksort( $combiners );
            $out .= implode( '', $combiners );
        }
        return $out;
    }

    /**
     * Recombine a decomposed string: pairs found in $utfCanonicalComp are
     * merged, and Hangul jamo are composed into syllables arithmetically.
     *
     * @param string $string Valid, decomposed and sorted UTF-8 string.
     * @return string Composed string.
     */
    public static function fastCompose( $string ) {
        self::loadData();
        $len = strlen( $string );
        $out = '';
        $lastClass = -1;
        $lastHangul = 0;
        # Current starter and the uncomposed marks that follow it; both
        # are flushed to $out when the next starter begins.
        $startChar = '';
        $combining = '';
        # First-byte range used as a cheap pre-filter for the jamo tests.
        $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
        $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                # No combining characters here...
                $out .= $startChar;
                $out .= $combining;
                $startChar = $c;
                $combining = '';
                $lastClass = 0;
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            $pair = $startChar . $c;
            if( $n > 0x80 ) {
                if( isset( self::$utfCombiningClass[$c] ) ) {
                    # A combining char; see what we can do with it
                    $class = self::$utfCombiningClass[$c];
                    if( !empty( $startChar ) &&
                        $lastClass < $class &&
                        $class > 0 &&
                        isset( self::$utfCanonicalComp[$pair] ) ) {
                        $startChar = self::$utfCanonicalComp[$pair];
                        $class = 0;
                    } else {
                        $combining .= $c;
                    }
                    $lastClass = $class;
                    $lastHangul = 0;
                    continue;
                }
            }
            # New start char
            if( $lastClass == 0 ) {
                if( isset( self::$utfCanonicalComp[$pair] ) ) {
                    $startChar = self::$utfCanonicalComp[$pair];
                    $lastHangul = 0;
                    continue;
                }
                if( $n >= $x1 && $n <= $x2 ) {
                    # WARNING: Hangul code is painfully slow.
                    # I apologize for this ugly, ugly code; however
                    # performance is even more teh suck if we call
                    # out to nice clean functions. Lookup tables are
                    # marginally faster, but require a lot of space.
                    #
                    if( $c >= UTF8_HANGUL_VBASE &&
                        $c <= UTF8_HANGUL_VEND &&
                        $startChar >= UTF8_HANGUL_LBASE &&
                        $startChar <= UTF8_HANGUL_LEND ) {
                        #
                        #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                        #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                        $lIndex = ord( $startChar[2] ) - 0x80;
                        $vIndex = ord( $c[2]         ) - 0xa1;

                        $hangulPoint = UNICODE_HANGUL_FIRST +
                            UNICODE_HANGUL_TCOUNT *
                            (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);

                        # Hardcode the limited-range UTF-8 conversion:
                        $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
                                     chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
                                     chr( $hangulPoint       & 0x3f | 0x80 );
                        $lastHangul = 0;
                        continue;
                    } elseif( $c >= UTF8_HANGUL_TBASE &&
                              $c <= UTF8_HANGUL_TEND &&
                              $startChar >= UTF8_HANGUL_FIRST &&
                              $startChar <= UTF8_HANGUL_LAST &&
                              !$lastHangul ) {
                        # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                        $tIndex = ord( $c[2] ) - 0xa7;
                        if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);

                        # Increment the code point by $tIndex, without
                        # the function overhead of decoding and recoding UTF-8
                        #
                        $tail = ord( $startChar[2] ) + $tIndex;
                        if( $tail > 0xbf ) {
                            # Carry into the middle byte, and from there
                            # into the head byte if that overflows too.
                            $tail -= 0x40;
                            $mid = ord( $startChar[1] ) + 1;
                            if( $mid > 0xbf ) {
                                $startChar[0] = chr( ord( $startChar[0] ) + 1 );
                                $mid -= 0x40;
                            }
                            $startChar[1] = chr( $mid );
                        }
                        $startChar[2] = chr( $tail );

                        # If there's another jamo char after this, *don't* try to merge it.
                        $lastHangul = 1;
                        continue;
                    }
                }
            }
            $out .= $startChar;
            $out .= $combining;
            $startChar = $c;
            $combining = '';
            $lastClass = 0;
            $lastHangul = 0;
        }
        $out .= $startChar . $combining;
        return $out;
    }

    /**
     * Copy a string one byte at a time and return the copy.
     * NOTE(review): presumably a do-nothing baseline for timing the other
     * routines - confirm against callers.
     *
     * @param string $string Any string.
     * @return string Byte-for-byte copy of the input.
     */
    public static function placebo( $string ) {
        $len = strlen( $string );
        $out = '';
        for( $i = 0; $i < $len; $i++ ) {
            $out .= $string[$i];
        }
        return $out;
    }

    /**
     * Pre-filter applied before handing a string to a native normalizer:
     * control characters other than tab, LF and CR, and the sequences
     * UTF8_FFFE and UTF8_FFFF, become UTF8_REPLACEMENT.
     *
     * @param string $string Input string.
     * @return string Filtered string.
     */
    private static function replaceForNativeNormalize( $string ) {
        $string = preg_replace(
            '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
            UTF8_REPLACEMENT,
            $string );
        $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
        $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
        return $string;
    }
}