MediaWiki
REL1_22
|
#!/usr/bin/env php
<?php
/**
 * Command-line conformance check for UtfNormal.
 *
 * Reads NormalizationTest.txt (see the URL below) from the current directory,
 * normalizes the five columns of every test row with each of the four
 * normalization forms, and reports every mismatch on standard output.
 *
 * Two families of checks are run per row:
 *  - the result of each wrapper must be identical to the result of its
 *    "_php" counterpart;
 *  - the results must match the columns the test file designates as the
 *    expected NFC / NFD / NFKC / NFKD output.
 */

if ( PHP_SAPI != 'cli' ) {
	die( "Run me from the command line please.\n" );
}

// From http://unicode.org/Public/UNIDATA/NormalizationTest.txt
$file = "NormalizationTest.txt";

// Anything after this character is a comment
define( 'COMMENT', '#' );

// Semicolons are used to separate the columns
define( 'SEPARATOR', ';' );

$f = fopen( $file, "r" );

require_once './UtfNormal.php';

/*
 * Thin wrappers around UtfNormal using its default code path.
 */
function normalize_form_c( $c ) {
	return UtfNormal::toNFC( $c );
}
function normalize_form_d( $c ) {
	return UtfNormal::toNFD( $c );
}
function normalize_form_kc( $c ) {
	return UtfNormal::toNFKC( $c );
}
function normalize_form_kd( $c ) {
	return UtfNormal::toNFKD( $c );
}

/*
 * Same wrappers, passing "php" as a second argument.
 * NOTE(review): presumably UtfNormal uses that argument to force its pure
 * PHP implementation; UtfNormal is not visible from this file - confirm.
 */
function normalize_form_c_php( $c ) {
	return UtfNormal::toNFC( $c, "php" );
}
function normalize_form_d_php( $c ) {
	return UtfNormal::toNFD( $c, "php" );
}
function normalize_form_kc_php( $c ) {
	return UtfNormal::toNFKC( $c, "php" );
}
function normalize_form_kd_php( $c ) {
	return UtfNormal::toNFKD( $c, "php" );
}

/**
 * Report a failed check.
 *
 * The signature is that of an assert() callback and is kept for backward
 * compatibility; only $code is used. The current row and line number are
 * read from the script's globals.
 *
 * @param string $file Unused.
 * @param int $line Unused.
 * @param string $code Textual description of the check that failed.
 */
function my_assert( $file, $line, $code ) {
	global $col, $lineNo;
	echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
}

/**
 * Compare two values with === and report through my_assert() on mismatch.
 *
 * This replaces string-evaluated assert() calls: those are no longer
 * evaluated as code by PHP 8 (a non-empty string is simply truthy), which
 * would make every check pass silently.
 *
 * @param string $code Label printed when the check fails.
 * @param mixed $expected
 * @param mixed $actual
 */
function check_identical( $code, $expected, $actual ) {
	if ( $expected !== $actual ) {
		my_assert( __FILE__, __LINE__, $code );
	}
}

if ( $f === false ) {
	// Do not pretend the run succeeded when the test data could not be read.
	fwrite( STDERR, "Unable to open $file for reading.\n" );
	exit( 1 );
}

// Normalization form => suffix of the normalize_form_* wrapper, listed in
// the order in which the forms are checked.
$forms = array(
	'NFC' => 'c',
	'NFD' => 'd',
	'NFKD' => 'kd',
	'NFKC' => 'kc',
);

// Each entry: index into $col holding the expected value, the normalization
// form, and the 1-based source columns whose normalized value must equal it.
$expectations = array(
	# c2 == NFC(c1) == NFC(c2) == NFC(c3)
	array( 1, 'NFC', array( 1, 2, 3 ) ),
	# c4 == NFC(c4) == NFC(c5)
	array( 3, 'NFC', array( 4, 5 ) ),
	# c3 == NFD(c1) == NFD(c2) == NFD(c3)
	array( 2, 'NFD', array( 1, 2, 3 ) ),
	# c5 == NFD(c4) == NFD(c5)
	array( 4, 'NFD', array( 4, 5 ) ),
	# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	array( 3, 'NFKC', array( 1, 2, 3, 4, 5 ) ),
	# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	array( 4, 'NFKD', array( 1, 2, 3, 4, 5 ) ),
);

$count = 0;
$lineNo = 0;
while ( ( $col = getRow( $f ) ) !== false ) {
	$lineNo++;

	// Only rows with five data columns plus the trailing comment are tests.
	if ( count( $col ) != 6 ) {
		continue;
	}
	$count++;
	if ( $count % 100 === 0 ) {
		echo "Count: $count\n";
	}

	# verify that the pure PHP version is correct
	$normalized = array();
	foreach ( $forms as $form => $suffix ) {
		$default = "normalize_form_$suffix";
		$purePhp = "normalize_form_{$suffix}_php";
		for ( $i = 1; $i <= 5; $i++ ) {
			$normalized[$form][$i] = $default( $col[$i - 1] );
			check_identical(
				'$' . $form . 'c' . $i . ' === $' . $form . 'c' . $i . 'p',
				$normalized[$form][$i],
				$purePhp( $col[$i - 1] )
			);
		}
	}

	# verify the results against the columns of the test file
	foreach ( $expectations as $expectation ) {
		list( $expectedIdx, $form, $sources ) = $expectation;
		foreach ( $sources as $i ) {
			check_identical(
				'$col[' . $expectedIdx . '] === $' . $form . 'c' . $i,
				$col[$expectedIdx],
				$normalized[$form][$i]
			);
		}
	}
}
fclose( $f );
echo "done.\n";

/**
 * Encode a code point as UTF-8.
 *
 * Compare against http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param int $c Code point.
 * @return string|bool The encoded bytes, or false if $c is above 0x10FFFF.
 */
function unichr( $c ) {
	if ( $c <= 0x7F ) {
		return chr( $c );
	} elseif ( $c <= 0x7FF ) {
		return chr( 0xC0 | ( $c >> 6 ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} elseif ( $c <= 0xFFFF ) {
		return chr( 0xE0 | ( $c >> 12 ) )
			. chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} elseif ( $c <= 0x10FFFF ) {
		return chr( 0xF0 | ( $c >> 18 ) )
			. chr( 0x80 | ( ( $c >> 12 ) & 0x3F ) )
			. chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $c & 0x3F ) );
	} else {
		return false;
	}
}

/**
 * Convert a whitespace-separated list of hexadecimal code points into a
 * UTF-8 string.
 *
 * Empty tokens (from leading, trailing or repeated whitespace) are dropped;
 * hexdec() would otherwise turn each of them into code point 0 and inject
 * NUL bytes into the result.
 *
 * @param string $c For example "0044 0307".
 * @return string
 */
function unistr( $c ) {
	$codes = preg_split( '/\s+/', $c, -1, PREG_SPLIT_NO_EMPTY );
	return implode( "", array_map( "unichr", array_map( "hexdec", $codes ) ) );
}

/**
 * Read and parse the next line of the test file.
 *
 * @param resource $f Handle to read from.
 * @return array|bool false at end of file. For a line that starts with the
 *  comment character, a one-element array holding the raw line. Otherwise
 *  one decoded string per non-blank SEPARATOR-delimited column, followed by
 *  the comment text ("" when the line has no comment).
 */
function getRow( $f ) {
	$row = fgets( $f );
	if ( $row === false ) {
		return false;
	}
	$row = rtrim( $row );

	$pos = strpos( $row, COMMENT );
	if ( $pos === 0 ) {
		// Whole-line comment: hand it back untouched.
		return array( $row );
	}

	$c = "";
	if ( $pos ) {
		// When the line contains a ")", keep only what follows the first one
		// (skipping one more character); otherwise keep the comment from the
		// comment character onwards.
		$pos2 = strpos( $row, ")" );
		if ( $pos2 ) {
			$c = substr( $row, $pos2 + 2 );
		} else {
			$c = substr( $row, $pos );
		}
		$row = substr( $row, 0, $pos );
	}

	$ret = array();
	foreach ( explode( SEPARATOR, $row ) as $ent ) {
		if ( trim( $ent ) !== "" ) {
			$ret[] = unistr( $ent );
		}
	}
	$ret[] = $c;

	return $ret;
}