MediaWiki
REL1_19
|
<?php
/**
 * Unicode normalization routines working on UTF-8 strings.
 *
 * The class prefers a native implementation when one is available
 * (the utf8_normalize() extension function or intl's
 * normalizer_normalize()) and otherwise falls back to the pure-PHP
 * code below, which relies on lookup tables loaded from
 * UtfNormalData.inc / UtfNormalDataK.inc.
 *
 * The UTF8_* and UNICODE_HANGUL_* constants used throughout are NOT
 * defined in this file; they must already be defined by the time the
 * methods that reference them are called.
 */

/** True when the utf8_normalize() extension function is available. */
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );

/** True when intl's normalizer_normalize() function is available. */
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );

/**
 * Static-only collection of normalization helpers.
 *
 * Not intended to be instantiated; every method is static.
 */
class UtfNormal {
	/**
	 * Mode values handed to utf8_normalize() as its second argument.
	 */
	const UNORM_NONE = 1;
	const UNORM_NFD = 2;
	const UNORM_NFKD = 3;
	const UNORM_NFC = 4;
	const UNORM_NFKC = 5;
	const UNORM_FCD = 6;
	const UNORM_DEFAULT = self::UNORM_NFC;

	/**
	 * Lookup tables keyed by UTF-8 character sequence. They are expected
	 * to be populated by the data file pulled in from loadData().
	 */
	static $utfCombiningClass = null;
	static $utfCanonicalComp = null;
	static $utfCanonicalDecomp = null;

	# Load compatibility decompositions on demand if they are needed.
	static $utfCompatibilityDecomp = null;

	static $utfCheckNFC;

	/**
	 * Clean up invalid UTF-8 sequences and convert the string to normal
	 * form C (canonical composition).
	 *
	 * Control characters matched by [\x00-\x08\x0b\x0c\x0e-\x1f] are
	 * replaced by UTF8_REPLACEMENT on every code path.
	 *
	 * @param $string String: a UTF-8 string, possibly invalid
	 * @return string a clean, NFC-normalized UTF-8 string
	 */
	static function cleanUp( $string ) {
		if( NORMALIZE_ICU ) {
			$string = self::replaceForNativeNormalize( $string );

			# UnicodeString constructor fails if the string ends with a
			# head byte. Add a junk char at the end, we'll strip it off.
			return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
		} elseif( NORMALIZE_INTL ) {
			$string = self::replaceForNativeNormalize( $string );
			$norm = normalizer_normalize( $string, Normalizer::FORM_C );
			if( $norm === null || $norm === false ) {
				# normalizer_normalize will either return false or null
				# (depending on which doc you read) if invalid utf8 string.
				# quickIsNFCVerify cleans up invalid sequences.

				if( self::quickIsNFCVerify( $string ) ) {
					# if that's true, the string is actually already normal.
					return $string;
				} else {
					# Now we are valid but non-normal
					return normalizer_normalize( $string, Normalizer::FORM_C );
				}
			} else {
				return $norm;
			}
		} elseif( self::quickIsNFCVerify( $string ) ) {
			# Side effect -- $string has had UTF-8 errors cleaned up.
			return $string;
		} else {
			return self::NFC( $string );
		}
	}

	/**
	 * Convert a UTF-8 string to normal form C (canonical composition).
	 * Unlike cleanUp(), no repair of invalid sequences is attempted here.
	 *
	 * @param $string String: a UTF-8 string (the pure-PHP path requires valid UTF-8)
	 * @return string the string in normal form C
	 */
	static function toNFC( $string ) {
		if( NORMALIZE_INTL )
			return normalizer_normalize( $string, Normalizer::FORM_C );
		elseif( NORMALIZE_ICU )
			return utf8_normalize( $string, self::UNORM_NFC );
		elseif( self::quickIsNFC( $string ) )
			return $string;
		else
			return self::NFC( $string );
	}

	/**
	 * Convert a UTF-8 string to normal form D (canonical decomposition).
	 *
	 * @param $string String: a UTF-8 string (the pure-PHP path requires valid UTF-8)
	 * @return string the string in normal form D
	 */
	static function toNFD( $string ) {
		if( NORMALIZE_INTL )
			return normalizer_normalize( $string, Normalizer::FORM_D );
		elseif( NORMALIZE_ICU )
			return utf8_normalize( $string, self::UNORM_NFD );
		elseif( preg_match( '/[\x80-\xff]/', $string ) )
			return self::NFD( $string );
		else
			# Pure ASCII: returned unchanged.
			return $string;
	}

	/**
	 * Convert a UTF-8 string to normal form KC (compatibility
	 * decomposition followed by composition).
	 *
	 * @param $string String: a UTF-8 string (the pure-PHP path requires valid UTF-8)
	 * @return string the string in normal form KC
	 */
	static function toNFKC( $string ) {
		if( NORMALIZE_INTL )
			return normalizer_normalize( $string, Normalizer::FORM_KC );
		elseif( NORMALIZE_ICU )
			return utf8_normalize( $string, self::UNORM_NFKC );
		elseif( preg_match( '/[\x80-\xff]/', $string ) )
			return self::NFKC( $string );
		else
			# Pure ASCII: returned unchanged.
			return $string;
	}

	/**
	 * Convert a UTF-8 string to normal form KD (compatibility
	 * decomposition).
	 *
	 * @param $string String: a UTF-8 string (the pure-PHP path requires valid UTF-8)
	 * @return string the string in normal form KD
	 */
	static function toNFKD( $string ) {
		if( NORMALIZE_INTL )
			return normalizer_normalize( $string, Normalizer::FORM_KD );
		elseif( NORMALIZE_ICU )
			return utf8_normalize( $string, self::UNORM_NFKD );
		elseif( preg_match( '/[\x80-\xff]/', $string ) )
			return self::NFKD( $string );
		else
			# Pure ASCII: returned unchanged.
			return $string;
	}

	/**
	 * Pull in UtfNormalData.inc (located next to this file) the first
	 * time it is needed, i.e. while self::$utfCombiningClass is unset.
	 */
	static function loadData() {
		if( !isset( self::$utfCombiningClass ) ) {
			require_once( dirname( __FILE__ ) . '/UtfNormalData.inc' );
		}
	}

	/**
	 * Quick conservative check for normal form C.
	 *
	 * Returns true only when no character of the string appears in the
	 * NFC quick-check table or the combining-class table; a false result
	 * means "run the slow path", not necessarily "not normalized".
	 *
	 * @param $string String: a valid UTF-8 string
	 * @return bool
	 */
	static function quickIsNFC( $string ) {
		# ASCII is always valid NFC!
		# If it's pure ASCII, let it through.
		if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

		self::loadData();
		$len = strlen( $string );
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n < 0x80 ) {
				continue;
			} elseif( $n >= 0xf0 ) {
				# Four-byte sequence
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				# Three-byte sequence
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				# Two-byte sequence
				$c = substr( $string, $i, 2 );
				$i++;
			}
			if( isset( self::$utfCheckNFC[$c] ) ) {
				# If it's NO or MAYBE, bail and do the slow check.
				return false;
			}
			if( isset( self::$utfCombiningClass[$c] ) ) {
				# Combining character? We might have to do sorting, at least.
				return false;
			}
		}
		return true;
	}

	/**
	 * Combined UTF-8 validity repair and NFC quick check.
	 *
	 * The string is modified in place: control characters and every
	 * invalid UTF-8 sequence found are replaced (by UTF8_REPLACEMENT, or
	 * dropped when they merely continue an already-replaced sequence).
	 *
	 * @param $string String: a UTF-8 string, altered on output to be valid UTF-8
	 * @return bool false if any character present is listed in the
	 *         quick-check or combining-class tables, true otherwise
	 */
	static function quickIsNFCVerify( &$string ) {
		# Screen out some characters that eg won't be allowed in XML
		$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

		# ASCII is always valid NFC!
		# If we're only ever given plain ASCII, we can avoid the overhead
		# of initializing the decomposition tables by skipping out early.
		if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

		# Function-static tables, built once on the first non-ASCII input.
		static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
		if( !isset( $checkit ) ) {
			# Load/build some scary lookup tables...
			self::loadData();

			$utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

			# Head bytes for sequences which we should do further validity checks
			$checkit = array_flip( array_map( 'chr',
				array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
					0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
					0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

			# Each UTF-8 head byte is followed by a certain
			# number of tail bytes.
			$tailBytes = array();
			for( $n = 0; $n < 256; $n++ ) {
				if( $n < 0xc0 ) {
					$remaining = 0;
				} elseif( $n < 0xe0 ) {
					$remaining = 1;
				} elseif( $n < 0xf0 ) {
					$remaining = 2;
				} elseif( $n < 0xf8 ) {
					$remaining = 3;
				} elseif( $n < 0xfc ) {
					$remaining = 4;
				} elseif( $n < 0xfe ) {
					$remaining = 5;
				} else {
					$remaining = 0;
				}
				$tailBytes[chr( $n )] = $remaining;
			}
		}

		# Chop the text into pure-ASCII and non-ASCII areas;
		# large ASCII parts can be handled much more quickly.
		# Don't chop up Unicode areas for punctuation, though,
		# that wastes energy.
		$matches = array();
		preg_match_all(
			'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
			$string, $matches );

		$looksNormal = true;
		# Byte offset of the current chunk within $string.
		$base = 0;
		# List of array( replacement, start offset, length ) fix-ups.
		$replace = array();
		foreach( $matches[1] as $str ) {
			$chunk = strlen( $str );

			if( $str[0] < "\x80" ) {
				# ASCII chunk: guaranteed to be valid UTF-8
				# and in normal form C, so skip over it.
				$base += $chunk;
				continue;
			}

			# We'll have to examine the chunk byte by byte to ensure
			# that it consists of valid UTF-8 sequences, and to see
			# if any of them might not be normalized.
			#
			# Since PHP is not the fastest language on earth, some of
			# this code is a little ugly with inner loop optimizations.

			$head = '';
			$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

			for( $i = -1; --$len; ) {
				$remaining = $tailBytes[$c = $str[++$i]];
				if( $remaining ) {
					# UTF-8 head byte!
					$sequence = $head = $c;
					do {
						# Look for the defined number of tail bytes...
						if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
							# Legal tail bytes are nice.
							$sequence .= $c;
						} else {
							if( 0 == $len ) {
								# Premature end of string!
								# Drop a replacement character into output to
								# represent the invalid UTF-8 sequence.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								break 2;
							} else {
								# Illegal tail byte; abandon the sequence.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i - strlen( $sequence ),
									strlen( $sequence ) );
								# Back up and reprocess this byte; it may itself
								# be a legal ASCII or UTF-8 sequence head.
								--$i;
								++$len;
								continue 2;
							}
						}
					} while( --$remaining );

					if( isset( $checkit[$head] ) ) {
						# Do some more detailed validity checks, for
						# invalid characters and illegal sequences.
						if( $head == "\xed" ) {
							# 0xed is relatively frequent in Korean, which
							# abuts the surrogate area, so we're doing
							# this check separately to speed things up.

							if( $sequence >= UTF8_SURROGATE_FIRST ) {
								# Surrogates are legal only in UTF-16 code.
								# They are totally forbidden here in UTF-8
								# utopia.
								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								$head = '';
								continue;
							}
						} else {
							# Slower, but rarer checks...
							$n = ord( $head );
							if(
								# "Overlong sequences" are those that are syntactically
								# correct but use more UTF-8 bytes than are necessary to
								# encode a character. Naïve string comparisons can be
								# tricked into failing to see a match for an ASCII
								# character, for instance, which can be a security hole
								# if blacklist checks are being used.
								( $n < 0xc2 && $sequence <= UTF8_OVERLONG_A )
								|| ( $n == 0xe0 && $sequence <= UTF8_OVERLONG_B )
								|| ( $n == 0xf0 && $sequence <= UTF8_OVERLONG_C )

								# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
								# Both comparisons are grouped under the head-byte test;
								# previously operator precedence left the U+FFFF test
								# outside of it.
								|| ( $n == 0xef &&
									( $sequence == UTF8_FFFE || $sequence == UTF8_FFFF ) )

								# Unicode has been limited to 21 bits; longer
								# sequences are not allowed.
								|| ( $n >= 0xf0 && $sequence > UTF8_MAX ) ) {

								$replace[] = array( UTF8_REPLACEMENT,
									$base + $i + 1 - strlen( $sequence ),
									strlen( $sequence ) );
								$head = '';
								continue;
							}
						}
					}

					if( isset( $utfCheckOrCombining[$sequence] ) ) {
						# If it's NO or MAYBE, we'll have to rip
						# the string apart and put it back together.
						# That's going to be mighty slow.
						$looksNormal = false;
					}

					# The sequence is legal!
					$head = '';
				} elseif( $c < "\x80" ) {
					# ASCII byte.
					$head = '';
				} elseif( $c < "\xc0" ) {
					# Illegal tail bytes
					if( $head == '' ) {
						# Out of the blue!
						$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					} else {
						# Don't add if we're continuing a broken sequence;
						# we already put a replacement character when we looked
						# at the broken sequence.
						$replace[] = array( '', $base + $i, 1 );
					}
				} else {
					# Miscellaneous freaks.
					$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
					$head = '';
				}
			}
			$base += $chunk;
		}
		if( count( $replace ) ) {
			# There were illegal UTF-8 sequences we need to fix up.
			$out = '';
			$last = 0;
			foreach( $replace as $rep ) {
				list( $replacement, $start, $length ) = $rep;
				if( $last < $start ) {
					# Copy the untouched bytes preceding this fix-up.
					$out .= substr( $string, $last, $start - $last );
				}
				$out .= $replacement;
				$last = $start + $length;
			}
			if( $last < strlen( $string ) ) {
				$out .= substr( $string, $last );
			}
			$string = $out;
		}
		return $looksNormal;
	}

	# These take a string and run the normalization on them, without
	# checking for validity or any optimization etc. Input must be
	# VALID UTF-8!

	/**
	 * Pure-PHP NFC: canonical decomposition followed by composition.
	 *
	 * @param $string String: valid UTF-8
	 * @return string
	 */
	static function NFC( $string ) {
		return self::fastCompose( self::NFD( $string ) );
	}

	/**
	 * Pure-PHP NFD: canonical decomposition followed by sorting of
	 * combining characters.
	 *
	 * @param $string String: valid UTF-8
	 * @return string
	 */
	static function NFD( $string ) {
		self::loadData();

		return self::fastCombiningSort(
			self::fastDecompose( $string, self::$utfCanonicalDecomp ) );
	}

	/**
	 * Pure-PHP NFKC: compatibility decomposition followed by composition.
	 *
	 * @param $string String: valid UTF-8
	 * @return string
	 */
	static function NFKC( $string ) {
		return self::fastCompose( self::NFKD( $string ) );
	}

	/**
	 * Pure-PHP NFKD: compatibility decomposition followed by sorting of
	 * combining characters. The compatibility table is loaded on first use.
	 *
	 * @param $string String: valid UTF-8
	 * @return string
	 */
	static function NFKD( $string ) {
		if( !isset( self::$utfCompatibilityDecomp ) ) {
			# Anchor the include to this file's directory, as loadData()
			# does, so that loading does not depend on include_path or
			# the current working directory.
			require_once( dirname( __FILE__ ) . '/UtfNormalDataK.inc' );
		}
		return self::fastCombiningSort(
			self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
	}


	/**
	 * Perform decomposition of a UTF-8 string using the given table.
	 * Characters found in $map are replaced by their mapped value;
	 * Hangul syllables are decomposed arithmetically into jamo;
	 * everything else is copied through. The result is NOT sorted.
	 *
	 * @param $string String: valid UTF-8
	 * @param $map Array: hash of UTF-8 sequence => decomposed sequence
	 * @return string
	 */
	static function fastDecompose( $string, $map ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n < 0x80 ) {
				# ASCII chars never decompose
				# THEY ARE IMMORTAL
				$out .= $c;
				continue;
			} elseif( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			if( isset( $map[$c] ) ) {
				$out .= $map[$c];
				continue;
			} else {
				if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
					# Decompose a hangul syllable into jamo;
					# hardcoded for three-byte UTF-8 sequence.
					# A lookup table would be slightly faster,
					# but adds a lot of memory & disk needs.
					#
					$index = ( ( ord( $c[0] ) & 0x0f ) << 12
							| ( ord( $c[1] ) & 0x3f ) << 6
							| ( ord( $c[2] ) & 0x3f ) )
						- UNICODE_HANGUL_FIRST;
					$l = intval( $index / UNICODE_HANGUL_NCOUNT );
					$v = intval( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
					$t = $index % UNICODE_HANGUL_TCOUNT;
					$out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
					if( $t >= 25 ) {
						# Trailing jamo whose encoding has second byte 0x87.
						$out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
					} elseif( $t ) {
						$out .= "\xe1\x86" . chr( 0xa7 + $t );
					}
					continue;
				}
			}
			$out .= $c;
		}
		return $out;
	}

	/**
	 * Sort runs of combining characters by ascending combining class.
	 * Characters sharing a class keep their relative order.
	 *
	 * @param $string String: valid, decomposed UTF-8
	 * @return string
	 */
	static function fastCombiningSort( $string ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		# Pending combining characters, bucketed by combining class.
		$combiners = array();
		$lastClass = -1;
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n >= 0x80 ) {
				if( $n >= 0xf0 ) {
					$c = substr( $string, $i, 4 );
					$i += 3;
				} elseif( $n >= 0xe0 ) {
					$c = substr( $string, $i, 3 );
					$i += 2;
				} elseif( $n >= 0xc0 ) {
					$c = substr( $string, $i, 2 );
					$i++;
				}
				if( isset( self::$utfCombiningClass[$c] ) ) {
					$lastClass = self::$utfCombiningClass[$c];
					if( isset( $combiners[$lastClass] ) ) {
						$combiners[$lastClass] .= $c;
					} else {
						$combiners[$lastClass] = $c;
					}
					continue;
				}
			}
			if( $lastClass ) {
				# A non-combining character ends the run: flush the buckets.
				ksort( $combiners );
				$out .= implode( '', $combiners );
				$combiners = array();
			}
			$out .= $c;
			$lastClass = 0;
		}
		if( $lastClass ) {
			ksort( $combiners );
			$out .= implode( '', $combiners );
		}
		return $out;
	}

	/**
	 * Produce the canonically composed form of a decomposed string,
	 * using the canonical composition table plus arithmetic Hangul
	 * composition.
	 *
	 * @param $string String: valid UTF-8 in sorted normal form D or KD
	 * @return string
	 */
	static function fastCompose( $string ) {
		self::loadData();
		$len = strlen( $string );
		$out = '';
		$lastClass = -1;
		$lastHangul = 0;
		$startChar = '';
		$combining = '';
		# Range of head bytes that can start a composable jamo.
		$x1 = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
		$x2 = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );
		for( $i = 0; $i < $len; $i++ ) {
			$c = $string[$i];
			$n = ord( $c );
			if( $n < 0x80 ) {
				# No combining characters here...
				$out .= $startChar;
				$out .= $combining;
				$startChar = $c;
				$combining = '';
				$lastClass = 0;
				continue;
			} elseif( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			$pair = $startChar . $c;
			if( $n > 0x80 ) {
				if( isset( self::$utfCombiningClass[$c] ) ) {
					# A combining char; see what we can do with it
					$class = self::$utfCombiningClass[$c];
					if( !empty( $startChar ) &&
						$lastClass < $class &&
						$class > 0 &&
						isset( self::$utfCanonicalComp[$pair] ) ) {
						$startChar = self::$utfCanonicalComp[$pair];
						$class = 0;
					} else {
						$combining .= $c;
					}
					$lastClass = $class;
					$lastHangul = 0;
					continue;
				}
			}
			# New start char
			if( $lastClass == 0 ) {
				if( isset( self::$utfCanonicalComp[$pair] ) ) {
					$startChar = self::$utfCanonicalComp[$pair];
					$lastHangul = 0;
					continue;
				}
				if( $n >= $x1 && $n <= $x2 ) {
					# WARNING: Hangul code is painfully slow.
					# I apologize for this ugly, ugly code; however
					# performance is even more teh suck if we call
					# out to nice clean functions. Lookup tables are
					# marginally faster, but require a lot of space.
					#
					if( $c >= UTF8_HANGUL_VBASE &&
						$c <= UTF8_HANGUL_VEND &&
						$startChar >= UTF8_HANGUL_LBASE &&
						$startChar <= UTF8_HANGUL_LEND ) {
						#
						#$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
						#$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
						$lIndex = ord( $startChar[2] ) - 0x80;
						$vIndex = ord( $c[2] ) - 0xa1;

						$hangulPoint = UNICODE_HANGUL_FIRST +
							UNICODE_HANGUL_TCOUNT *
							( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

						# Hardcode the limited-range UTF-8 conversion:
						$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
							chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
							chr( $hangulPoint & 0x3f | 0x80 );
						$lastHangul = 0;
						continue;
					} elseif( $c >= UTF8_HANGUL_TBASE &&
						$c <= UTF8_HANGUL_TEND &&
						$startChar >= UTF8_HANGUL_FIRST &&
						$startChar <= UTF8_HANGUL_LAST &&
						!$lastHangul ) {
						# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
						$tIndex = ord( $c[2] ) - 0xa7;
						if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );

						# Increment the code point by $tIndex, without
						# the function overhead of decoding and recoding UTF-8
						#
						$tail = ord( $startChar[2] ) + $tIndex;
						if( $tail > 0xbf ) {
							# Carry into the middle (and possibly head) byte.
							$tail -= 0x40;
							$mid = ord( $startChar[1] ) + 1;
							if( $mid > 0xbf ) {
								$startChar[0] = chr( ord( $startChar[0] ) + 1 );
								$mid -= 0x40;
							}
							$startChar[1] = chr( $mid );
						}
						$startChar[2] = chr( $tail );

						# If there's another jamo char after this, *don't* try to merge it.
						$lastHangul = 1;
						continue;
					}
				}
			}
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			$lastHangul = 0;
		}
		$out .= $startChar . $combining;
		return $out;
	}

	/**
	 * Copy the string byte by byte and return the copy unchanged.
	 * Performs no normalization at all.
	 *
	 * @param $string String
	 * @return string
	 */
	static function placebo( $string ) {
		$len = strlen( $string );
		$out = '';
		for( $i = 0; $i < $len; $i++ ) {
			$out .= $string[$i];
		}
		return $out;
	}

	/**
	 * Pre-filter applied before handing a string to a native normalizer:
	 * control characters matched by [\x00-\x08\x0b\x0c\x0e-\x1f] and the
	 * UTF8_FFFE / UTF8_FFFF sequences are replaced by UTF8_REPLACEMENT.
	 *
	 * @param $string String: the string to filter
	 * @return string the filtered string
	 */
	private static function replaceForNativeNormalize( $string ) {
		$string = preg_replace(
			'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
			UTF8_REPLACEMENT,
			$string );
		$string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
		$string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
		return $string;
	}
}