<?php
/**
 * Unicode normalization routines (MediaWiki, REL1_24).
 *
 * This file relies on names that are NOT defined here and must already be
 * available when its methods run:
 *  - the UTF8_* and UNICODE_HANGUL_* constants (UTF8_REPLACEMENT,
 *    UTF8_SURROGATE_FIRST, UTF8_OVERLONG_A/B/C, UTF8_FFFE, UTF8_FFFF,
 *    UTF8_MAX, UTF8_HANGUL_*, UNICODE_HANGUL_*);
 *  - the data files UtfNormalData.inc and UtfNormalDataK.inc, expected to sit
 *    next to this file and (presumably) to populate the static lookup tables
 *    declared on UtfNormal.
 *
 * @file
 */

/** True when the utf8_normalize() function (ICU wrapper extension) exists. */
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );

/** True when normalizer_normalize() (PHP intl extension) exists. */
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );

/**
 * Static helpers for validating UTF-8 and converting strings between the
 * Unicode normalization forms (NFC, NFD, NFKC, NFKD).
 *
 * Native implementations are used when available (see NORMALIZE_ICU and
 * NORMALIZE_INTL); otherwise the pure-PHP routines in this class are used.
 */
class UtfNormal {
	/**
	 * Mode values handed to utf8_normalize() as its second argument.
	 */
	const UNORM_NONE = 1;
	const UNORM_NFD = 2;
	const UNORM_NFKD = 3;
	const UNORM_NFC = 4;
	const UNORM_NFKC = 5;
	const UNORM_FCD = 6;
	const UNORM_DEFAULT = self::UNORM_NFC;

	/** @var array|null UTF-8 character => canonical combining class */
	public static $utfCombiningClass = null;

	/** @var array|null Decomposed UTF-8 pair => composed UTF-8 character */
	public static $utfCanonicalComp = null;

	/** @var array|null UTF-8 character => canonical decomposition */
	public static $utfCanonicalDecomp = null;

	# Load compatibility decompositions on demand if they are needed.
	/** @var array|null UTF-8 character => compatibility decomposition */
	public static $utfCompatibilityDecomp = null;

	/** @var array Characters whose presence forces the slow NFC check */
	public static $utfCheckNFC;

	/**
	 * Clean up a string and return it in normal form C.
	 *
	 * Control characters that are screened out, and invalid UTF-8 sequences,
	 * are replaced with UTF8_REPLACEMENT. A native normalizer is used when
	 * one is available, otherwise the pure-PHP implementation.
	 *
	 * @param string $string A UTF-8 string, possibly containing invalid sequences
	 * @return string A clean, NFC-normalized UTF-8 string
	 */
	public static function cleanUp( $string ) {
		if ( NORMALIZE_ICU ) {
			$string = self::replaceForNativeNormalize( $string );

			# UnicodeString constructor fails if the string ends with a
			# head byte. Add a junk char at the end, we'll strip it off.
			return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
		}

		if ( NORMALIZE_INTL ) {
			$string = self::replaceForNativeNormalize( $string );
			$norm = normalizer_normalize( $string, Normalizer::FORM_C );
			if ( $norm !== null && $norm !== false ) {
				return $norm;
			}

			# normalizer_normalize will either return false or null
			# (depending on which doc you read) if invalid utf8 string.
			# quickIsNFCVerify cleans up invalid sequences (by reference).
			if ( self::quickIsNFCVerify( $string ) ) {
				# If that's true, the string is actually already normal.
				return $string;
			}

			# Now we are valid but non-normal.
			return normalizer_normalize( $string, Normalizer::FORM_C );
		}

		if ( self::quickIsNFCVerify( $string ) ) {
			# Side effect -- $string has had UTF-8 errors cleaned up.
			return $string;
		}

		return self::NFC( $string );
	}

	/**
	 * Convert a UTF-8 string to normal form C (canonical composition).
	 *
	 * No validation is performed; invalid input may yield garbage (or false
	 * from the native normalizer).
	 *
	 * @param string $string A valid UTF-8 string
	 * @return string The string in normal form C
	 */
	public static function toNFC( $string ) {
		if ( NORMALIZE_INTL ) {
			return normalizer_normalize( $string, Normalizer::FORM_C );
		} elseif ( NORMALIZE_ICU ) {
			return utf8_normalize( $string, self::UNORM_NFC );
		} elseif ( self::quickIsNFC( $string ) ) {
			return $string;
		} else {
			return self::NFC( $string );
		}
	}

	/**
	 * Convert a UTF-8 string to normal form D (canonical decomposition).
	 *
	 * @param string $string A valid UTF-8 string
	 * @return string The string in normal form D
	 */
	public static function toNFD( $string ) {
		if ( NORMALIZE_INTL ) {
			return normalizer_normalize( $string, Normalizer::FORM_D );
		} elseif ( NORMALIZE_ICU ) {
			return utf8_normalize( $string, self::UNORM_NFD );
		} elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
			return self::NFD( $string );
		} else {
			# Pure ASCII is returned untouched.
			return $string;
		}
	}

	/**
	 * Convert a UTF-8 string to normal form KC (compatibility composition).
	 *
	 * @param string $string A valid UTF-8 string
	 * @return string The string in normal form KC
	 */
	public static function toNFKC( $string ) {
		if ( NORMALIZE_INTL ) {
			return normalizer_normalize( $string, Normalizer::FORM_KC );
		} elseif ( NORMALIZE_ICU ) {
			return utf8_normalize( $string, self::UNORM_NFKC );
		} elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
			return self::NFKC( $string );
		} else {
			# Pure ASCII is returned untouched.
			return $string;
		}
	}

	/**
	 * Convert a UTF-8 string to normal form KD (compatibility decomposition).
	 *
	 * @param string $string A valid UTF-8 string
	 * @return string The string in normal form KD
	 */
	public static function toNFKD( $string ) {
		if ( NORMALIZE_INTL ) {
			return normalizer_normalize( $string, Normalizer::FORM_KD );
		} elseif ( NORMALIZE_ICU ) {
			return utf8_normalize( $string, self::UNORM_NFKD );
		} elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
			return self::NFKD( $string );
		} else {
			# Pure ASCII is returned untouched.
			return $string;
		}
	}

	/**
	 * Load the basic composition data if it has not been loaded yet.
	 *
	 * "Loaded" is detected by self::$utfCombiningClass being set, which the
	 * included data file is presumably responsible for.
	 */
	public static function loadData() {
		if ( !isset( self::$utfCombiningClass ) ) {
			require_once __DIR__ . '/UtfNormalData.inc';
		}
	}

	/**
	 * Quick check: is the string definitely already in normal form C?
	 *
	 * A false result only means "not sure"; the caller should then run the
	 * full normalization. The input is not validated.
	 *
	 * @param string $string A valid UTF-8 string
	 * @return bool True when no character needing the slow path was found
	 */
	public static function quickIsNFC( $string ) {
		# ASCII is always valid NFC!
		# If it's pure ASCII, let it through.
		if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
			return true;
		}

		self::loadData();
		$len = strlen( $string );
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if ( $n < 0x80 ) {
				continue;
			} elseif ( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif ( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif ( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			if ( isset( self::$utfCheckNFC[$c] ) ) {
				# If it's NO or MAYBE, bail and do the slow check.
				return false;
			}
			if ( isset( self::$utfCombiningClass[$c] ) ) {
				# Combining character? We might have to do sorting, at least.
				return false;
			}
		}

		return true;
	}

	/**
	 * Validate UTF-8 in place and report whether the result looks like NFC.
	 *
	 * The string is modified through the reference: screened control
	 * characters and illegal UTF-8 sequences are replaced with
	 * UTF8_REPLACEMENT (stray tail bytes that continue an already-reported
	 * broken sequence are dropped).
	 *
	 * @param string &$string A UTF-8 string, altered on output to be valid UTF-8
	 * @return bool False when the cleaned string may need normalizing
	 */
	public static function quickIsNFCVerify( &$string ) {
		# Screen out some characters that eg won't be allowed in XML
		$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

		# ASCII is always valid NFC!
		# If we're only ever given plain ASCII, we can avoid the overhead
		# of initializing the decomposition tables by skipping out early.
		if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
			return true;
		}

		# Lookup tables are built once and kept for later calls.
		static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
		if ( !isset( $checkit ) ) {
			# Load/build some scary lookup tables...
			self::loadData();

			$utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

			# Head bytes for sequences which we should do further validity checks
			$checkit = array_flip( array_map( 'chr',
				array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
					0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
					0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

			# Each UTF-8 head byte is followed by a certain
			# number of tail bytes.
			$tailBytes = array();
			for ( $n = 0; $n < 256; $n++ ) {
				if ( $n < 0xc0 ) {
					$remaining = 0;
				} elseif ( $n < 0xe0 ) {
					$remaining = 1;
				} elseif ( $n < 0xf0 ) {
					$remaining = 2;
				} elseif ( $n < 0xf8 ) {
					$remaining = 3;
				} elseif ( $n < 0xfc ) {
					$remaining = 4;
				} elseif ( $n < 0xfe ) {
					$remaining = 5;
				} else {
					$remaining = 0;
				}
				$tailBytes[chr( $n )] = $remaining;
			}
		}

		# Chop the text into pure-ASCII and non-ASCII areas;
		# large ASCII parts can be handled much more quickly.
		# Don't chop up Unicode areas for punctuation, though,
		# that wastes energy.
		$matches = array();
		preg_match_all(
			'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
			$string, $matches );

		$looksNormal = true;
		# Byte offset of the current chunk within $string.
		$base = 0;
		# Pending fix-ups, each array( replacement, start offset, length ).
		$replace = array();
		foreach ( $matches[1] as $str ) {
			$chunk = strlen( $str );

			if ( $str[0] < "\x80" ) {
				# ASCII chunk: guaranteed to be valid UTF-8
				# and in normal form C, so skip over it.
				$base += $chunk;
				continue;
			}

			# We'll have to examine the chunk byte by byte to ensure
			# that it consists of valid UTF-8 sequences, and to see
			# if any of them might not be normalized.
			#
			# Since PHP is not the fastest language on earth, some of
			# this code is a little ugly with inner loop optimizations.

			$head = '';
			# Counting down is faster; $len is the number of bytes left + 1.
			$len = $chunk + 1;

			for ( $i = -1; --$len; ) {
				$remaining = $tailBytes[$c = $str[++$i]];
				if ( $remaining ) {
					# UTF-8 head byte!
					$sequence = $head = $c;
					do {
						# Look for the defined number of tail bytes...
						if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
							# Legal tail bytes are nice.
							$sequence .= $c;
						} else {
							if ( $len === 0 ) {
								# Premature end of string!
								# Drop a replacement character into output to
								# represent the invalid UTF-8 sequence.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								break 2;
							} else {
								# Illegal tail byte; abandon the sequence.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i - strlen( $sequence ),
									strlen( $sequence ) );
								# Back up and reprocess this byte; it may itself
								# be a legal ASCII or UTF-8 sequence head.
								--$i;
								++$len;
								continue 2;
							}
						}
					} while ( --$remaining );

					if ( isset( $checkit[$head] ) ) {
						# Do some more detailed validity checks, for
						# invalid characters and illegal sequences.
						if ( $head == "\xed" ) {
							# 0xed is relatively frequent in Korean, which
							# abuts the surrogate area, so we're doing
							# this check separately to speed things up.

							if ( $sequence >= UTF8_SURROGATE_FIRST ) {
								# Surrogates are legal only in UTF-16 code.
								# They are totally forbidden here in UTF-8
								# utopia.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								$head = '';
								continue;
							}
						} else {
							# Slower, but rarer checks...
							$n = ord( $head );
							if (
								# "Overlong sequences" are those that are syntactically
								# correct but use more UTF-8 bytes than are necessary to
								# encode a character. Naive string comparisons can be
								# tricked into failing to see a match for an ASCII
								# character, for instance, which can be a security hole
								# if blacklist checks are being used.
								( $n < 0xc2 && $sequence <= UTF8_OVERLONG_A )
								|| ( $n == 0xe0 && $sequence <= UTF8_OVERLONG_B )
								|| ( $n == 0xf0 && $sequence <= UTF8_OVERLONG_C )

								# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
								# Both comparisons are guarded by the head-byte test.
								|| ( $n == 0xef &&
									( $sequence == UTF8_FFFE || $sequence == UTF8_FFFF ) )

								# Unicode has been limited to 21 bits; longer
								# sequences are not allowed.
								|| ( $n >= 0xf0 && $sequence > UTF8_MAX )
							) {
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								$head = '';
								continue;
							}
						}
					}

					if ( isset( $utfCheckOrCombining[$sequence] ) ) {
						# If it's NO or MAYBE, we'll have to rip
						# the string apart and put it back together.
						# That's going to be mighty slow.
						$looksNormal = false;
					}

					# The sequence is legal!
					$head = '';
				} elseif ( $c < "\x80" ) {
					# ASCII byte.
					$head = '';
				} elseif ( $c < "\xc0" ) {
					# Illegal tail bytes
					if ( $head == '' ) {
						# Out of the blue!
						$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					} else {
						# Don't add if we're continuing a broken sequence;
						# we already put a replacement character when we looked
						# at the broken sequence.
						$replace[] = array( '', $base + $i, 1 );
					}
				} else {
					# Miscellaneous freaks.
					$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					$head = '';
				}
			}
			$base += $chunk;
		}

		if ( count( $replace ) ) {
			# There were illegal UTF-8 sequences we need to fix up.
			$out = '';
			$last = 0;
			foreach ( $replace as $rep ) {
				list( $replacement, $start, $length ) = $rep;
				if ( $last < $start ) {
					$out .= substr( $string, $last, $start - $last );
				}
				$out .= $replacement;
				$last = $start + $length;
			}
			if ( $last < strlen( $string ) ) {
				$out .= substr( $string, $last );
			}
			$string = $out;
		}

		return $looksNormal;
	}

	# These take a string and run the normalization on them, without
	# checking for validity or any optimization etc. Input must be
	# VALID UTF-8!

	/**
	 * Pure-PHP conversion to normal form C: decompose, then recompose.
	 *
	 * @param string $string A valid UTF-8 string
	 * @return string
	 */
	public static function NFC( $string ) {
		return self::fastCompose( self::NFD( $string ) );
	}

	/**
	 * Pure-PHP conversion to normal form D: canonical decomposition followed
	 * by sorting of combining characters.
	 *
	 * @param string $string A valid UTF-8 string
	 * @return string
	 */
	public static function NFD( $string ) {
		self::loadData();

		return self::fastCombiningSort(
			self::fastDecompose( $string, self::$utfCanonicalDecomp ) );
	}

	/**
	 * Pure-PHP conversion to normal form KC: compatibility decomposition,
	 * then recomposition.
	 *
	 * @param string $string A valid UTF-8 string
	 * @return string
	 */
	public static function NFKC( $string ) {
		return self::fastCompose( self::NFKD( $string ) );
	}

	/**
	 * Pure-PHP conversion to normal form KD: compatibility decomposition
	 * followed by sorting of combining characters.
	 *
	 * The compatibility table is loaded on first use.
	 *
	 * @param string $string A valid UTF-8 string
	 * @return string
	 */
	public static function NFKD( $string ) {
		if ( !isset( self::$utfCompatibilityDecomp ) ) {
			# Anchor on __DIR__ like loadData() so that include_path cannot
			# substitute a different file of the same name.
			require_once __DIR__ . '/UtfNormalDataK.inc';
		}

		return self::fastCombiningSort(
			self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
	}

	/**
	 * Decompose every character of a string using the given table, plus
	 * arithmetic decomposition of Hangul syllables.
	 *
	 * Combining characters are NOT reordered here; see fastCombiningSort().
	 *
	 * @param string $string A valid UTF-8 string
	 * @param array $map UTF-8 character => decomposed UTF-8 string
	 * @return string
	 */
	public static function fastDecompose( $string, $map ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if ( $n < 0x80 ) {
				# ASCII chars never decompose
				$out .= $c;
				continue;
			} elseif ( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif ( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif ( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}

			if ( isset( $map[$c] ) ) {
				$out .= $map[$c];
				continue;
			}

			if ( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
				# Decompose a hangul syllable into jamo;
				# hardcoded for three-byte UTF-8 sequence.
				# A lookup table would be slightly faster,
				# but adds a lot of memory & disk needs.
				$index = ( ( ord( $c[0] ) & 0x0f ) << 12
						| ( ord( $c[1] ) & 0x3f ) << 6
						| ( ord( $c[2] ) & 0x3f ) )
					- UNICODE_HANGUL_FIRST;
				$l = intval( $index / UNICODE_HANGUL_NCOUNT );
				$v = intval( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
				$t = $index % UNICODE_HANGUL_TCOUNT;
				$out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
				if ( $t >= 25 ) {
					# Trailing jamo from index 25 onward use the next
					# second-byte value (\x87) in their UTF-8 encoding.
					$out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
				} elseif ( $t ) {
					$out .= "\xe1\x86" . chr( 0xa7 + $t );
				}
				continue;
			}

			$out .= $c;
		}

		return $out;
	}

	/**
	 * Sort each run of combining characters into order of ascending
	 * combining class, as required for a canonical decomposition.
	 *
	 * @param string $string A valid, decomposed UTF-8 string
	 * @return string
	 */
	public static function fastCombiningSort( $string ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		# Combining class => concatenated characters of that class, for the
		# run currently being collected.
		$combiners = array();
		$lastClass = -1;
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if ( $n >= 0x80 ) {
				if ( $n >= 0xf0 ) {
					$c = substr( $string, $i, 4 );
					$i += 3;
				} elseif ( $n >= 0xe0 ) {
					$c = substr( $string, $i, 3 );
					$i += 2;
				} elseif ( $n >= 0xc0 ) {
					$c = substr( $string, $i, 2 );
					$i++;
				}
				if ( isset( self::$utfCombiningClass[$c] ) ) {
					$lastClass = self::$utfCombiningClass[$c];
					if ( isset( $combiners[$lastClass] ) ) {
						$combiners[$lastClass] .= $c;
					} else {
						$combiners[$lastClass] = $c;
					}
					continue;
				}
			}
			# A non-combining character ends the current run: flush it.
			if ( $lastClass ) {
				ksort( $combiners );
				$out .= implode( '', $combiners );
				$combiners = array();
			}
			$out .= $c;
			$lastClass = 0;
		}
		if ( $lastClass ) {
			ksort( $combiners );
			$out .= implode( '', $combiners );
		}

		return $out;
	}

	/**
	 * Canonically compose a decomposed string, including arithmetic
	 * composition of Hangul jamo into syllables.
	 *
	 * @param string $string A valid UTF-8 string in normal form D or KD
	 * @return string
	 */
	public static function fastCompose( $string ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		$lastClass = -1;
		$lastHangul = 0;
		# Pending starter and the combining marks that could not be merged
		# into it; both are flushed when the next starter arrives.
		$startChar = '';
		$combining = '';
		# First-byte range used as a cheap pre-filter for Hangul jamo.
		$x1 = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
		$x2 = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if ( $n < 0x80 ) {
				# No combining characters here...
				$out .= $startChar;
				$out .= $combining;
				$startChar = $c;
				$combining = '';
				$lastClass = 0;
				continue;
			} elseif ( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif ( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif ( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			$pair = $startChar . $c;
			if ( $n > 0x80 ) {
				if ( isset( self::$utfCombiningClass[$c] ) ) {
					# A combining char; see what we can do with it
					$class = self::$utfCombiningClass[$c];
					# Explicit '' test: empty() would also reject the starter "0".
					if ( $startChar !== '' &&
						$lastClass < $class &&
						$class > 0 &&
						isset( self::$utfCanonicalComp[$pair] )
					) {
						$startChar = self::$utfCanonicalComp[$pair];
						$class = 0;
					} else {
						$combining .= $c;
					}
					$lastClass = $class;
					$lastHangul = 0;
					continue;
				}
			}
			# New start char
			if ( $lastClass == 0 ) {
				if ( isset( self::$utfCanonicalComp[$pair] ) ) {
					$startChar = self::$utfCanonicalComp[$pair];
					$lastHangul = 0;
					continue;
				}
				if ( $n >= $x1 && $n <= $x2 ) {
					# WARNING: Hangul code is painfully slow.
					# It is deliberately inlined: calling out to helper
					# functions performs even worse. Lookup tables are
					# marginally faster, but require a lot of space.
					if ( $c >= UTF8_HANGUL_VBASE &&
						$c <= UTF8_HANGUL_VEND &&
						$startChar >= UTF8_HANGUL_LBASE &&
						$startChar <= UTF8_HANGUL_LEND
					) {
						# Leading consonant + vowel => LV syllable.
						# Equivalent to:
						#   $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
						#   $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
						$lIndex = ord( $startChar[2] ) - 0x80;
						$vIndex = ord( $c[2] ) - 0xa1;

						$hangulPoint = UNICODE_HANGUL_FIRST +
							UNICODE_HANGUL_TCOUNT *
							( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

						# Hardcode the limited-range UTF-8 conversion:
						$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
							chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
							chr( $hangulPoint & 0x3f | 0x80 );
						$lastHangul = 0;
						continue;
					} elseif ( $c >= UTF8_HANGUL_TBASE &&
						$c <= UTF8_HANGUL_TEND &&
						$startChar >= UTF8_HANGUL_FIRST &&
						$startChar <= UTF8_HANGUL_LAST &&
						!$lastHangul
					) {
						# Syllable + trailing consonant => LVT syllable.
						# Equivalent to:
						#   $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
						$tIndex = ord( $c[2] ) - 0xa7;
						if ( $tIndex < 0 ) {
							$tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
						}

						# Increment the code point by $tIndex, without
						# the function overhead of decoding and recoding UTF-8
						$tail = ord( $startChar[2] ) + $tIndex;
						if ( $tail > 0xbf ) {
							# Carry into the middle (and possibly first) byte.
							$tail -= 0x40;
							$mid = ord( $startChar[1] ) + 1;
							if ( $mid > 0xbf ) {
								$startChar[0] = chr( ord( $startChar[0] ) + 1 );
								$mid -= 0x40;
							}
							$startChar[1] = chr( $mid );
						}
						$startChar[2] = chr( $tail );

						# If there's another jamo char after this, *don't* try to merge it.
						$lastHangul = 1;
						continue;
					}
				}
			}
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			$lastHangul = 0;
		}
		$out .= $startChar . $combining;

		return $out;
	}

	/**
	 * Copy a string one byte at a time and return the copy unchanged.
	 *
	 * Performs no normalization; presumably a baseline for timing how long
	 * a bare byte-wise pass over a string takes.
	 *
	 * @param string $string
	 * @return string A byte-identical copy of $string
	 */
	public static function placebo( $string ) {
		$len = strlen( $string );
		$out = '';
		for ( $i = 0; $i < $len; $i++ ) {
			$out .= $string[$i];
		}

		return $out;
	}

	/**
	 * Pre-clean a string before handing it to a native normalizer.
	 *
	 * Replaces the control characters matched below, and the sequences
	 * UTF8_FFFE and UTF8_FFFF, with UTF8_REPLACEMENT.
	 *
	 * @param string $string The string
	 * @return string The string with the offending characters replaced
	 */
	private static function replaceForNativeNormalize( $string ) {
		$string = preg_replace(
			'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
			UTF8_REPLACEMENT,
			$string );
		$string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
		$string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );

		return $string;
	}
}