MediaWiki
REL1_22
|
<?php
/**
 * Unicode normalization routines for UTF-8 strings.
 *
 * This file relies on the UTF8_* and UNICODE_HANGUL_* constants and on the
 * UtfNormalData.inc / UtfNormalDataK.inc data files, none of which are
 * defined here; they must be provided by the surrounding code base.
 */

/** True when a native utf8_normalize() function is available. */
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );

/** True when the native normalizer_normalize() function is available. */
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );

/**
 * Unicode normalization for UTF-8 strings.
 *
 * The public entry points (cleanUp, toNFC, toNFD, toNFKC, toNFKD) use a
 * native implementation when one was detected at load time (see
 * NORMALIZE_ICU / NORMALIZE_INTL) and otherwise fall back to the pure-PHP
 * routines in this class, which work from lookup tables loaded on demand.
 *
 * Only cleanUp() and quickIsNFCVerify() repair invalid UTF-8; every other
 * method expects its input to be valid UTF-8 already.
 */
class UtfNormal {
    /**
     * Normalization mode values passed as the second argument to
     * utf8_normalize().
     */
    const UNORM_NONE = 1;
    const UNORM_NFD = 2;
    const UNORM_NFKD = 3;
    const UNORM_NFC = 4;
    const UNORM_NFKC = 5;
    const UNORM_FCD = 6;
    const UNORM_DEFAULT = self::UNORM_NFC;

    /**
     * Lookup tables, keyed by UTF-8 character sequence. They are null until
     * loadData() pulls in UtfNormalData.inc (presumably that file fills
     * them in -- it is not part of this file).
     */
    static $utfCombiningClass = null;
    static $utfCanonicalComp = null;
    static $utfCanonicalDecomp = null;

    # Load compatibility decompositions on demand if they are needed.
    static $utfCompatibilityDecomp = null;

    /** Characters whose presence means a string may not be in NFC. */
    static $utfCheckNFC;

    /**
     * Clean up a string that may contain invalid UTF-8 and convert it to
     * normal form C.
     *
     * Control characters that are screened out and invalid sequences are
     * replaced with UTF8_REPLACEMENT. Fast paths are taken for ASCII and
     * for text that already looks normalized.
     *
     * @param string $string A UTF-8 string, possibly invalid
     * @return string A clean, NFC-normalized UTF-8 string
     */
    static function cleanUp( $string ) {
        if( NORMALIZE_ICU ) {
            $string = self::replaceForNativeNormalize( $string );

            # UnicodeString constructor fails if the string ends with a
            # head byte. Add a junk char at the end, we'll strip it off.
            return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
        } elseif( NORMALIZE_INTL ) {
            $string = self::replaceForNativeNormalize( $string );
            $norm = normalizer_normalize( $string, Normalizer::FORM_C );
            if( $norm === null || $norm === false ) {
                # normalizer_normalize will either return false or null
                # (depending on which doc you read) if invalid utf8 string.
                # quickIsNFCVerify cleans up invalid sequences.

                if( UtfNormal::quickIsNFCVerify( $string ) ) {
                    # if that's true, the string is actually already normal.
                    return $string;
                } else {
                    # Now we are valid but non-normal
                    return normalizer_normalize( $string, Normalizer::FORM_C );
                }
            } else {
                return $norm;
            }
        } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
            # Side effect -- $string has had UTF-8 errors cleaned up.
            return $string;
        } else {
            return UtfNormal::NFC( $string );
        }
    }

    /**
     * Convert a UTF-8 string to normal form C (canonical composition).
     * No validity checking or repair is performed in this method.
     *
     * @param string $string A valid UTF-8 string
     * @return string The string in normal form C
     */
    static function toNFC( $string ) {
        if( NORMALIZE_INTL )
            return normalizer_normalize( $string, Normalizer::FORM_C );
        elseif( NORMALIZE_ICU )
            return utf8_normalize( $string, self::UNORM_NFC );
        elseif( UtfNormal::quickIsNFC( $string ) )
            return $string;
        else
            return UtfNormal::NFC( $string );
    }

    /**
     * Convert a UTF-8 string to normal form D (canonical decomposition).
     * Pure-ASCII input is returned untouched on the PHP fallback path.
     *
     * @param string $string A valid UTF-8 string
     * @return string The string in normal form D
     */
    static function toNFD( $string ) {
        if( NORMALIZE_INTL )
            return normalizer_normalize( $string, Normalizer::FORM_D );
        elseif( NORMALIZE_ICU )
            return utf8_normalize( $string, self::UNORM_NFD );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return UtfNormal::NFD( $string );
        else
            return $string;
    }

    /**
     * Convert a UTF-8 string to normal form KC (compatibility composition).
     * Pure-ASCII input is returned untouched on the PHP fallback path.
     *
     * @param string $string A valid UTF-8 string
     * @return string The string in normal form KC
     */
    static function toNFKC( $string ) {
        if( NORMALIZE_INTL )
            return normalizer_normalize( $string, Normalizer::FORM_KC );
        elseif( NORMALIZE_ICU )
            return utf8_normalize( $string, self::UNORM_NFKC );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return UtfNormal::NFKC( $string );
        else
            return $string;
    }

    /**
     * Convert a UTF-8 string to normal form KD (compatibility decomposition).
     * Pure-ASCII input is returned untouched on the PHP fallback path.
     *
     * @param string $string A valid UTF-8 string
     * @return string The string in normal form KD
     */
    static function toNFKD( $string ) {
        if( NORMALIZE_INTL )
            return normalizer_normalize( $string, Normalizer::FORM_KD );
        elseif( NORMALIZE_ICU )
            return utf8_normalize( $string, self::UNORM_NFKD );
        elseif( preg_match( '/[\x80-\xff]/', $string ) )
            return UtfNormal::NFKD( $string );
        else
            return $string;
    }

    /**
     * Load the basic composition data if it has not been loaded yet.
     * "Loaded" is judged by whether $utfCombiningClass is set.
     */
    static function loadData() {
        if( !isset( self::$utfCombiningClass ) ) {
            require_once __DIR__ . '/UtfNormalData.inc';
        }
    }

    /**
     * Quick check whether a string is already in normal form C.
     *
     * A true result means no character from the check table or the
     * combining-class table was seen; false means "no, or not sure".
     *
     * @param string $string A valid UTF-8 string
     * @return bool
     */
    static function quickIsNFC( $string ) {
        # ASCII is always valid NFC!
        # If it's pure ASCII, let it through.
        if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

        UtfNormal::loadData();
        $len = strlen( $string );
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            if( isset( self::$utfCheckNFC[$c] ) ) {
                # If it's NO or MAYBE, bail and do the slow check.
                return false;
            }
            if( isset( self::$utfCombiningClass[$c] ) ) {
                # Combining character? We might have to do sorting, at least.
                return false;
            }
        }
        return true;
    }

    /**
     * Quick NFC check that also repairs the string in place.
     *
     * Screened-out control characters and invalid UTF-8 sequences
     * (truncated or broken sequences, stray tail bytes, overlong forms,
     * surrogates, U+FFFE/U+FFFF, values above UTF8_MAX) are replaced with
     * UTF8_REPLACEMENT in the caller's variable.
     *
     * @param string &$string A UTF-8 string, altered on output to be valid UTF-8
     * @return bool True if the (repaired) string looks already normalized;
     *  false if it contains characters that may need normalizing
     */
    static function quickIsNFCVerify( &$string ) {
        # Screen out some characters that eg won't be allowed in XML
        $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

        # ASCII is always valid NFC!
        # If we're only ever given plain ASCII, we can avoid the overhead
        # of initializing the decomposition tables by skipping out early.
        if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

        static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
        if( !isset( $checkit ) ) {
            # Load/build some scary lookup tables...
            UtfNormal::loadData();

            $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

            # Head bytes for sequences which we should do further validity checks
            $checkit = array_flip( array_map( 'chr',
                array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
                    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
                    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

            # Each UTF-8 head byte is followed by a certain
            # number of tail bytes.
            $tailBytes = array();
            for( $n = 0; $n < 256; $n++ ) {
                if( $n < 0xc0 ) {
                    $remaining = 0;
                } elseif( $n < 0xe0 ) {
                    $remaining = 1;
                } elseif( $n < 0xf0 ) {
                    $remaining = 2;
                } elseif( $n < 0xf8 ) {
                    $remaining = 3;
                } elseif( $n < 0xfc ) {
                    $remaining = 4;
                } elseif( $n < 0xfe ) {
                    $remaining = 5;
                } else {
                    $remaining = 0;
                }
                $tailBytes[chr($n)] = $remaining;
            }
        }

        # Chop the text into pure-ASCII and non-ASCII areas;
        # large ASCII parts can be handled much more quickly.
        # Don't chop up Unicode areas for punctuation, though,
        # that wastes energy.
        $matches = array();
        preg_match_all(
            '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
            $string, $matches );

        $looksNormal = true;
        # $base is the byte offset of the current chunk within $string.
        $base = 0;
        # Each entry is array( replacement text, start offset, length ).
        $replace = array();
        foreach( $matches[1] as $str ) {
            $chunk = strlen( $str );

            if( $str[0] < "\x80" ) {
                # ASCII chunk: guaranteed to be valid UTF-8
                # and in normal form C, so skip over it.
                $base += $chunk;
                continue;
            }

            # We'll have to examine the chunk byte by byte to ensure
            # that it consists of valid UTF-8 sequences, and to see
            # if any of them might not be normalized.
            #
            # Since PHP is not the fastest language on earth, some of
            # this code is a little ugly with inner loop optimizations.

            $head = '';
            $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

            for( $i = -1; --$len; ) {
                $remaining = $tailBytes[$c = $str[++$i]];
                if( $remaining ) {
                    # UTF-8 head byte!
                    $sequence = $head = $c;
                    do {
                        # Look for the defined number of tail bytes...
                        if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
                            # Legal tail bytes are nice.
                            $sequence .= $c;
                        } else {
                            if( 0 == $len ) {
                                # Premature end of string!
                                # Drop a replacement character into output to
                                # represent the invalid UTF-8 sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                    $base + $i + 1 - strlen( $sequence ),
                                    strlen( $sequence ) );
                                break 2;
                            } else {
                                # Illegal tail byte; abandon the sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                    $base + $i - strlen( $sequence ),
                                    strlen( $sequence ) );
                                # Back up and reprocess this byte; it may itself
                                # be a legal ASCII or UTF-8 sequence head.
                                --$i;
                                ++$len;
                                continue 2;
                            }
                        }
                    } while( --$remaining );

                    if( isset( $checkit[$head] ) ) {
                        # Do some more detailed validity checks, for
                        # invalid characters and illegal sequences.
                        if( $head == "\xed" ) {
                            # 0xed is relatively frequent in Korean, which
                            # abuts the surrogate area, so we're doing
                            # this check separately to speed things up.

                            if( $sequence >= UTF8_SURROGATE_FIRST ) {
                                # Surrogates are legal only in UTF-16 code.
                                # They are totally forbidden here in UTF-8
                                # utopia.
                                $replace[] = array( UTF8_REPLACEMENT,
                                    $base + $i + 1 - strlen( $sequence ),
                                    strlen( $sequence ) );
                                $head = '';
                                continue;
                            }
                        } else {
                            # Slower, but rarer checks...
                            $n = ord( $head );
                            if(
                                # "Overlong sequences" are those that are syntactically
                                # correct but use more UTF-8 bytes than are necessary to
                                # encode a character. Naïve string comparisons can be
                                # tricked into failing to see a match for an ASCII
                                # character, for instance, which can be a security hole
                                # if blacklist checks are being used.
                                ($n < 0xc2 && $sequence <= UTF8_OVERLONG_A)
                                || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
                                || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)

                                # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
                                # Both comparisons are grouped under the 0xef head
                                # byte check so the intent does not depend on
                                # && binding tighter than ||.
                                || ($n == 0xef &&
                                    ($sequence == UTF8_FFFE
                                    || $sequence == UTF8_FFFF) )

                                # Unicode has been limited to 21 bits; longer
                                # sequences are not allowed.
                                || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {

                                $replace[] = array( UTF8_REPLACEMENT,
                                    $base + $i + 1 - strlen( $sequence ),
                                    strlen( $sequence ) );
                                $head = '';
                                continue;
                            }
                        }
                    }

                    if( isset( $utfCheckOrCombining[$sequence] ) ) {
                        # If it's NO or MAYBE, we'll have to rip
                        # the string apart and put it back together.
                        # That's going to be mighty slow.
                        $looksNormal = false;
                    }

                    # The sequence is legal!
                    $head = '';
                } elseif( $c < "\x80" ) {
                    # ASCII byte.
                    $head = '';
                } elseif( $c < "\xc0" ) {
                    # Illegal tail bytes
                    if( $head == '' ) {
                        # Out of the blue!
                        $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    } else {
                        # Don't add if we're continuing a broken sequence;
                        # we already put a replacement character when we looked
                        # at the broken sequence.
                        $replace[] = array( '', $base + $i, 1 );
                    }
                } else {
                    # Miscellaneous freaks.
                    $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    $head = '';
                }
            }
            $base += $chunk;
        }
        if( count( $replace ) ) {
            # There were illegal UTF-8 sequences we need to fix up.
            $out = '';
            $last = 0;
            foreach( $replace as $rep ) {
                list( $replacement, $start, $length ) = $rep;
                if( $last < $start ) {
                    $out .= substr( $string, $last, $start - $last );
                }
                $out .= $replacement;
                $last = $start + $length;
            }
            if( $last < strlen( $string ) ) {
                $out .= substr( $string, $last );
            }
            $string = $out;
        }
        return $looksNormal;
    }

    # These take a string and run the normalization on them, without
    # checking for validity or any optimization etc. Input must be
    # VALID UTF-8!

    /**
     * Pure-PHP conversion to normal form C: decompose, then compose.
     *
     * @param string $string A valid UTF-8 string
     * @return string
     */
    static function NFC( $string ) {
        return UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
    }

    /**
     * Pure-PHP conversion to normal form D: canonical decomposition
     * followed by sorting of combining characters.
     *
     * @param string $string A valid UTF-8 string
     * @return string
     */
    static function NFD( $string ) {
        UtfNormal::loadData();

        return UtfNormal::fastCombiningSort(
            UtfNormal::fastDecompose( $string, self::$utfCanonicalDecomp ) );
    }

    /**
     * Pure-PHP conversion to normal form KC: compatibility decomposition,
     * then composition.
     *
     * @param string $string A valid UTF-8 string
     * @return string
     */
    static function NFKC( $string ) {
        return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
    }

    /**
     * Pure-PHP conversion to normal form KD: compatibility decomposition
     * followed by sorting of combining characters. The compatibility
     * table is loaded on first use.
     *
     * @param string $string A valid UTF-8 string
     * @return string
     */
    static function NFKD( $string ) {
        if( !isset( self::$utfCompatibilityDecomp ) ) {
            # Anchor to this file's directory, like loadData() does, so the
            # table is found regardless of include_path or the current
            # working directory.
            require_once __DIR__ . '/UtfNormalDataK.inc';
        }
        return self::fastCombiningSort(
            self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
    }


    /**
     * Perform decomposition of a UTF-8 string using the given table.
     * Characters found in $map are replaced by their mapping; Hangul
     * syllables are decomposed arithmetically; everything else is copied.
     * The result is not yet sorted by combining class.
     *
     * @param string $string A valid UTF-8 string
     * @param array $map Decomposition table keyed by UTF-8 character
     * @return string
     */
    static function fastDecompose( $string, $map ) {
        UtfNormal::loadData();
        $len = strlen( $string );
        $out = '';
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                # ASCII chars never decompose
                # THEY ARE IMMORTAL
                $out .= $c;
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            if( isset( $map[$c] ) ) {
                $out .= $map[$c];
                continue;
            } else {
                if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
                    # Decompose a hangul syllable into jamo;
                    # hardcoded for three-byte UTF-8 sequence.
                    # A lookup table would be slightly faster,
                    # but adds a lot of memory & disk needs.
                    #
                    $index = ( (ord( $c[0] ) & 0x0f) << 12
                             | (ord( $c[1] ) & 0x3f) << 6
                             | (ord( $c[2] ) & 0x3f) )
                           - UNICODE_HANGUL_FIRST;
                    $l = intval( $index / UNICODE_HANGUL_NCOUNT );
                    $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
                    $t = $index % UNICODE_HANGUL_TCOUNT;
                    $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
                    if( $t >= 25 ) {
                        $out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
                    } elseif( $t ) {
                        $out .= "\xe1\x86" . chr( 0xa7 + $t );
                    }
                    continue;
                }
            }
            $out .= $c;
        }
        return $out;
    }

    /**
     * Sort runs of combining characters into order of ascending combining
     * class. Characters sharing a class keep their relative order.
     *
     * @param string $string A valid, decomposed UTF-8 string
     * @return string
     */
    static function fastCombiningSort( $string ) {
        UtfNormal::loadData();
        $len = strlen( $string );
        $out = '';
        # Pending combining characters, bucketed by combining class.
        $combiners = array();
        $lastClass = -1;
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n >= 0x80 ) {
                if( $n >= 0xf0 ) {
                    $c = substr( $string, $i, 4 );
                    $i += 3;
                } elseif( $n >= 0xe0 ) {
                    $c = substr( $string, $i, 3 );
                    $i += 2;
                } elseif( $n >= 0xc0 ) {
                    $c = substr( $string, $i, 2 );
                    $i++;
                }
                if( isset( self::$utfCombiningClass[$c] ) ) {
                    $lastClass = self::$utfCombiningClass[$c];
                    if( isset( $combiners[$lastClass] ) ) {
                        $combiners[$lastClass] .= $c;
                    } else {
                        $combiners[$lastClass] = $c;
                    }
                    continue;
                }
            }
            # Non-combining character: flush any pending combiners first.
            if( $lastClass ) {
                ksort( $combiners );
                $out .= implode( '', $combiners );
                $combiners = array();
            }
            $out .= $c;
            $lastClass = 0;
        }
        if( $lastClass ) {
            ksort( $combiners );
            $out .= implode( '', $combiners );
        }
        return $out;
    }

    /**
     * Produce canonically composed sequences from decomposed UTF-8 text,
     * using the canonical composition table plus arithmetic composition
     * of Hangul jamo.
     *
     * @param string $string A valid, decomposed and sorted UTF-8 string
     * @return string
     */
    static function fastCompose( $string ) {
        UtfNormal::loadData();
        $len = strlen( $string );
        $out = '';
        $lastClass = -1;
        $lastHangul = 0;
        $startChar = '';
        $combining = '';
        # Range of head bytes that can start a composable jamo; used as a
        # cheap pre-filter before the string comparisons below.
        $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
        $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
        for( $i = 0; $i < $len; $i++ ) {
            $c = $string[$i];
            $n = ord( $c );
            if( $n < 0x80 ) {
                # No combining characters here...
                $out .= $startChar;
                $out .= $combining;
                $startChar = $c;
                $combining = '';
                $lastClass = 0;
                continue;
            } elseif( $n >= 0xf0 ) {
                $c = substr( $string, $i, 4 );
                $i += 3;
            } elseif( $n >= 0xe0 ) {
                $c = substr( $string, $i, 3 );
                $i += 2;
            } elseif( $n >= 0xc0 ) {
                $c = substr( $string, $i, 2 );
                $i++;
            }
            $pair = $startChar . $c;
            if( $n > 0x80 ) {
                if( isset( self::$utfCombiningClass[$c] ) ) {
                    # A combining char; see what we can do with it
                    $class = self::$utfCombiningClass[$c];
                    if( !empty( $startChar ) &&
                        $lastClass < $class &&
                        $class > 0 &&
                        isset( self::$utfCanonicalComp[$pair] ) ) {
                        $startChar = self::$utfCanonicalComp[$pair];
                        $class = 0;
                    } else {
                        $combining .= $c;
                    }
                    $lastClass = $class;
                    $lastHangul = 0;
                    continue;
                }
            }
            # New start char
            if( $lastClass == 0 ) {
                if( isset( self::$utfCanonicalComp[$pair] ) ) {
                    $startChar = self::$utfCanonicalComp[$pair];
                    $lastHangul = 0;
                    continue;
                }
                if( $n >= $x1 && $n <= $x2 ) {
                    # WARNING: Hangul code is painfully slow.
                    # I apologize for this ugly, ugly code; however
                    # performance is even more teh suck if we call
                    # out to nice clean functions. Lookup tables are
                    # marginally faster, but require a lot of space.
                    #
                    if( $c >= UTF8_HANGUL_VBASE &&
                        $c <= UTF8_HANGUL_VEND &&
                        $startChar >= UTF8_HANGUL_LBASE &&
                        $startChar <= UTF8_HANGUL_LEND ) {
                        #
                        #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                        #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                        $lIndex = ord( $startChar[2] ) - 0x80;
                        $vIndex = ord( $c[2] ) - 0xa1;

                        $hangulPoint = UNICODE_HANGUL_FIRST +
                            UNICODE_HANGUL_TCOUNT *
                            (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);

                        # Hardcode the limited-range UTF-8 conversion:
                        $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
                            chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
                            chr( $hangulPoint & 0x3f | 0x80 );
                        $lastHangul = 0;
                        continue;
                    } elseif( $c >= UTF8_HANGUL_TBASE &&
                        $c <= UTF8_HANGUL_TEND &&
                        $startChar >= UTF8_HANGUL_FIRST &&
                        $startChar <= UTF8_HANGUL_LAST &&
                        !$lastHangul ) {
                        # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                        $tIndex = ord( $c[2] ) - 0xa7;
                        if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);

                        # Increment the code point by $tIndex, without
                        # the function overhead of decoding and recoding UTF-8
                        #
                        $tail = ord( $startChar[2] ) + $tIndex;
                        if( $tail > 0xbf ) {
                            $tail -= 0x40;
                            $mid = ord( $startChar[1] ) + 1;
                            if( $mid > 0xbf ) {
                                $startChar[0] = chr( ord( $startChar[0] ) + 1 );
                                $mid -= 0x40;
                            }
                            $startChar[1] = chr( $mid );
                        }
                        $startChar[2] = chr( $tail );

                        # If there's another jamo char after this, *don't* try to merge it.
                        $lastHangul = 1;
                        continue;
                    }
                }
            }
            $out .= $startChar;
            $out .= $combining;
            $startChar = $c;
            $combining = '';
            $lastClass = 0;
            $lastHangul = 0;
        }
        $out .= $startChar . $combining;
        return $out;
    }

    /**
     * Copy a string byte by byte without changing it. This performs no
     * normalization; it only walks the string and rebuilds it.
     *
     * @param string $string
     * @return string An identical copy of the input
     */
    static function placebo( $string ) {
        $len = strlen( $string );
        $out = '';
        for( $i = 0; $i < $len; $i++ ) {
            $out .= $string[$i];
        }
        return $out;
    }

    /**
     * Replace characters that the native normalizers would pass through
     * but that cleanUp() must not emit: the screened-out control
     * characters, U+FFFE and U+FFFF all become UTF8_REPLACEMENT.
     *
     * @param string $string The string to pre-process
     * @return string The string with those characters replaced
     */
    private static function replaceForNativeNormalize( $string ) {
        $string = preg_replace(
            '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
            UTF8_REPLACEMENT,
            $string );
        $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
        $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
        return $string;
    }
}