MediaWiki
REL1_19
|
#!/usr/bin/php
<?php
/**
 * Command-line conformance check for the UtfNormal normalization routines.
 *
 * Reads the Unicode NormalizationTest.txt data file from the current
 * directory, runs every data row through the four normalization forms
 * (NFC, NFD, NFKC, NFKD) and prints a line for each expectation that
 * does not hold.
 *
 * The checks are plain comparisons reported through my_assert() rather
 * than string-evaluated assert() calls: string assertions are no longer
 * evaluated by PHP 8 (a non-empty string is simply truthy), which would
 * make every check pass without testing anything.
 */

if ( php_sapi_name() !== 'cli' ) {
	die( "Run me from the command line please.\n" );
}

// From http://unicode.org/Public/UNIDATA/NormalizationTest.txt
$file = "NormalizationTest.txt";

// Anything after this character is a comment
define( 'COMMENT', '#' );

// Semicolons are used to separate the columns
define( 'SEPARATOR', ';' );

$f = fopen( $file, "r" );

require_once( "./UtfNormal.php" );

// Default code path of UtfNormal for each normalization form.
function normalize_form_c( $c ) { return UtfNormal::toNFC( $c ); }
function normalize_form_d( $c ) { return UtfNormal::toNFD( $c ); }
function normalize_form_kc( $c ) { return UtfNormal::toNFKC( $c ); }
function normalize_form_kd( $c ) { return UtfNormal::toNFKD( $c ); }

// Variants intended to exercise the pure PHP implementation. The extra
// "php" argument is forwarded exactly as before.
// NOTE(review): how UtfNormal interprets that argument is not visible
// from this file - confirm against UtfNormal.php.
function normalize_form_c_php( $c ) { return UtfNormal::toNFC( $c, "php" ); }
function normalize_form_d_php( $c ) { return UtfNormal::toNFD( $c, "php" ); }
function normalize_form_kc_php( $c ) { return UtfNormal::toNFKC( $c, "php" ); }
function normalize_form_kd_php( $c ) { return UtfNormal::toNFKD( $c, "php" ); }

/**
 * Report a failed expectation for the row currently being processed.
 *
 * Reads the globals $col (current row; index 5 holds the row's comment)
 * and $lineNo (number of rows read so far).
 *
 * @param string $file Script file that raised the failure (unused)
 * @param int $line Script line that raised the failure (unused)
 * @param string $code Textual form of the expectation that failed
 */
function my_assert( $file, $line, $code ) {
	global $col, $lineNo;
	echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
}

/**
 * Report $code through my_assert() when $holds is false.
 *
 * @param string $code Textual form of the expectation, used in the report
 * @param bool $holds Result of evaluating the expectation
 */
function check_assertion( $code, $holds ) {
	if ( !$holds ) {
		my_assert( __FILE__, __LINE__, $code );
	}
}

$count = 0;
$lineNo = 0;

if ( $f === false ) {
	// Without the data file nothing can be verified; do not claim success.
	fwrite( STDERR, "Could not open $file for reading.\n" );
	exit( 1 );
}

// Label prefix => function name, in the order the checks are reported.
// The pure PHP counterpart of each function carries a "_php" suffix.
$forms = array(
	'NFC' => 'normalize_form_c',
	'NFD' => 'normalize_form_d',
	'NFKD' => 'normalize_form_kd',
	'NFKC' => 'normalize_form_kc',
);

// Each rule is: index into $col holding the expected value, the form to
// compare against, and the source columns (1-based) whose normalized
// value must equal it.
$invariants = array(
	# c2 == NFC(c1) == NFC(c2) == NFC(c3)
	array( 1, 'NFC', array( 1, 2, 3 ) ),
	# c4 == NFC(c4) == NFC(c5)
	array( 3, 'NFC', array( 4, 5 ) ),
	# c3 == NFD(c1) == NFD(c2) == NFD(c3)
	array( 2, 'NFD', array( 1, 2, 3 ) ),
	# c5 == NFD(c4) == NFD(c5)
	array( 4, 'NFD', array( 4, 5 ) ),
	# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	array( 3, 'NFKC', array( 1, 2, 3, 4, 5 ) ),
	# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	array( 4, 'NFKD', array( 1, 2, 3, 4, 5 ) ),
);

while ( ( $col = getRow( $f ) ) !== false ) {
	$lineNo++;

	// Data rows have five columns plus the trailing comment entry;
	// anything else (comments, part headers, blank lines) is skipped.
	if ( count( $col ) !== 6 ) {
		continue;
	}

	$count++;
	if ( $count % 100 === 0 ) {
		echo "Count: $count\n";
	}

	# verify that the pure PHP version is correct
	$normalized = array();
	foreach ( $forms as $form => $native ) {
		$pure = $native . '_php';
		for ( $n = 1; $n <= 5; $n++ ) {
			$input = $col[$n - 1];
			$normalized[$form][$n] = $native( $input );
			$purePhp = $pure( $input );
			$label = '$' . $form . 'c' . $n;
			check_assertion(
				$label . ' === ' . $label . 'p',
				$normalized[$form][$n] === $purePhp
			);
		}
	}

	// Conformance invariants between the columns of the row.
	foreach ( $invariants as $rule ) {
		list( $expectedIdx, $form, $sources ) = $rule;
		foreach ( $sources as $n ) {
			check_assertion(
				'$col[' . $expectedIdx . '] === $' . $form . 'c' . $n,
				$col[$expectedIdx] === $normalized[$form][$n]
			);
		}
	}
}

fclose( $f );
echo "done.\n";

/**
 * Encode a single code point as a UTF-8 byte string.
 *
 * Compare against http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param int $c Code point
 * @return string|bool Encoded bytes, or false when $c is above 0x10FFFF
 */
function unichr( $c ) {
	if ( $c <= 0x7F ) {
		return chr( $c );
	} elseif ( $c <= 0x7FF ) {
		return chr( 0xC0 | ( $c >> 6 ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} elseif ( $c <= 0xFFFF ) {
		return chr( 0xE0 | ( $c >> 12 ) )
			. chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} elseif ( $c <= 0x10FFFF ) {
		return chr( 0xF0 | ( $c >> 18 ) )
			. chr( 0x80 | ( ( $c >> 12 ) & 0x3F ) )
			. chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} else {
		return false;
	}
}

/**
 * Convert a space-separated list of hexadecimal code points to UTF-8.
 *
 * @param string $c Hexadecimal code points separated by single spaces
 * @return string Concatenation of unichr() applied to each code point
 */
function unistr( $c ) {
	return implode( "", array_map( "unichr", array_map( "hexdec", explode( " ", $c ) ) ) );
}

/**
 * Read the next line from the data file and split it into columns.
 *
 * @param resource $f Open file handle
 * @return array|bool false once fgets() reports no more data; a
 *  one-element array holding the raw line when the line starts with the
 *  comment character; otherwise the non-blank columns converted with
 *  unistr(), followed by one final entry holding the comment text
 *  (empty string when the line has no comment).
 */
function getRow( $f ) {
	$row = fgets( $f );
	if ( $row === false ) {
		return false;
	}
	$row = rtrim( $row );
	$pos = strpos( $row, COMMENT );
	$pos2 = strpos( $row, ")" );
	if ( $pos === 0 ) {
		return array( $row );
	}
	$c = "";

	if ( $pos ) {
		if ( $pos2 ) {
			// Keep only what follows the first ")" and the character after it.
			$c = substr( $row, $pos2 + 2 );
		} else {
			$c = substr( $row, $pos );
		}
		// Drop the comment so that only the data columns remain.
		$row = substr( $row, 0, $pos );
	}

	$ret = array();
	foreach ( explode( SEPARATOR, $row ) as $ent ) {
		if ( trim( $ent ) !== "" ) {
			$ret[] = unistr( $ent );
		}
	}
	$ret[] = $c;

	return $ret;
}