MediaWiki  REL1_22
UtfNormalTest2.php
Go to the documentation of this file.
00001 #!/usr/bin/env php
00002 <?php
// This is a command-line test driver; refuse to run under any other SAPI.
if( 'cli' != PHP_SAPI ) {
    exit( "Run me from the command line please.\n" );
}

// Name of the test data file. The data is published at
// http://unicode.org/Public/UNIDATA/NormalizationTest.txt
$file = 'NormalizationTest.txt';

// Marks the start of a comment: everything after it on a line is not test data.
define( 'COMMENT', '#' );

// Delimiter between the columns of a data row.
define( 'SEPARATOR', ';' );

// Read-only handle on the test data; false when the file cannot be opened.
$f = fopen( $file, 'r' );
00039 
00068 require_once './UtfNormal.php';
/*
 * Wrappers around the UtfNormal entry points, one per normalization form,
 * called with the input string only.
 */

function normalize_form_c( $c ) {
    return UtfNormal::toNFC( $c );
}

function normalize_form_d( $c ) {
    return UtfNormal::toNFD( $c );
}

function normalize_form_kc( $c ) {
    return UtfNormal::toNFKC( $c );
}

function normalize_form_kd( $c ) {
    return UtfNormal::toNFKD( $c );
}

/*
 * Same entry points, called with the extra argument "php".
 * NOTE(review): presumably this selects the pure PHP implementation --
 * confirm that UtfNormal::toNF*() actually honours a second argument.
 */

function normalize_form_c_php( $c ) {
    return UtfNormal::toNFC( $c, "php" );
}

function normalize_form_d_php( $c ) {
    return UtfNormal::toNFD( $c, "php" );
}

function normalize_form_kc_php( $c ) {
    return UtfNormal::toNFKC( $c, "php" );
}

function normalize_form_kd_php( $c ) {
    return UtfNormal::toNFKD( $c, "php" );
}
00085 
// Configure assert(): enabled, no PHP warning on failure, and failures
// reported through my_assert() instead.
assert_options( ASSERT_ACTIVE, 1 );
assert_options( ASSERT_WARNING, 0 );
// ASSERT_QUIET_EVAL no longer exists as of PHP 8.0; referencing it
// unconditionally would abort the script there with an undefined-constant error.
if ( defined( 'ASSERT_QUIET_EVAL' ) ) {
    assert_options( ASSERT_QUIET_EVAL, 1 );
}
assert_options( ASSERT_CALLBACK, 'my_assert' );
00090 
/**
 * Assertion-failure callback: prints the failed expression together with the
 * current data line number and the sixth column of the current row.
 *
 * Reads the globals $lineNo and $col; $file and $line are accepted for the
 * callback signature but not used.
 *
 * @param string $file File in which the assertion failed (unused)
 * @param int $line Line on which the assertion failed (unused)
 * @param string $code Text of the failed assertion
 */
function my_assert( $file, $line, $code ) {
    global $col, $lineNo;
    printf( "Assertion that '%s' failed on line %s (%s)\n", $code, $lineNo, $col[5] );
}
00095 
$count = 0;
$lineNo = 0;

if ( $f === false ) {
    // Without the data file nothing gets verified; fail loudly instead of
    // printing "done." as if the run had succeeded.
    fwrite( STDERR, "Could not open $file for reading.\n" );
    exit( 1 );
}

// Normalization form => array( default wrapper, "php" wrapper ).
// The order (NFC, NFD, NFKD, NFKC) fixes the order in which mismatches are reported.
$forms = array(
    'NFC'  => array( 'normalize_form_c',  'normalize_form_c_php' ),
    'NFD'  => array( 'normalize_form_d',  'normalize_form_d_php' ),
    'NFKD' => array( 'normalize_form_kd', 'normalize_form_kd_php' ),
    'NFKC' => array( 'normalize_form_kc', 'normalize_form_kc_php' ),
);

// Conformance invariants, each as
// array( index into $col of the expected value, form, source columns c1..c5 ).
$invariants = array(
    # c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
    array( 1, 'NFC',  array( 1, 2, 3 ) ),
    # c4 ==  NFC(c4) ==  NFC(c5)
    array( 3, 'NFC',  array( 4, 5 ) ),
    # c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
    array( 2, 'NFD',  array( 1, 2, 3 ) ),
    # c5 ==  NFD(c4) ==  NFD(c5)
    array( 4, 'NFD',  array( 4, 5 ) ),
    # c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
    array( 3, 'NFKC', array( 1, 2, 3, 4, 5 ) ),
    # c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
    array( 4, 'NFKD', array( 1, 2, 3, 4, 5 ) ),
);

while ( ( $col = getRow( $f ) ) !== false ) {
    $lineNo++;

    // Only rows with five data columns plus the description are test cases.
    if ( count( $col ) != 6 ) {
        continue;
    }

    $count++;
    if ( $count % 100 === 0 ) {
        echo "Count: $count\n";
    }

    // The comparisons are explicit rather than string-form assert() calls:
    // PHP 8 no longer evaluates string assertions, which would make every
    // check pass without testing anything. Failures are still reported
    // through my_assert() with the same expression text as before.

    # verify that the pure PHP version is correct
    $normalized = array();
    foreach ( $forms as $form => $impl ) {
        for ( $n = 1; $n <= 5; $n++ ) {
            $input = $col[$n - 1];
            $default = call_user_func( $impl[0], $input );
            $pure = call_user_func( $impl[1], $input );
            $normalized[$form][$n] = $default;
            if ( $default !== $pure ) {
                my_assert( __FILE__, __LINE__,
                    '$' . $form . 'c' . $n . ' === $' . $form . 'c' . $n . 'p' );
            }
        }
    }

    // Check the default results against the expected columns.
    foreach ( $invariants as $rule ) {
        list( $expectedCol, $form, $sources ) = $rule;
        foreach ( $sources as $n ) {
            if ( $col[$expectedCol] !== $normalized[$form][$n] ) {
                my_assert( __FILE__, __LINE__,
                    '$col[' . $expectedCol . '] === $' . $form . 'c' . $n );
            }
        }
    }
}

fclose( $f );
echo "done.\n";
00208 
/**
 * Encode a single code point as a UTF-8 byte sequence.
 *
 * Compare against http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * Surrogate code points (0xD800-0xDFFF) are not rejected; they are encoded
 * as three bytes like any other value in that range.
 *
 * @param int $c Code point
 * @return string|bool One to four bytes, or false when $c is negative or
 *   greater than 0x10FFFF
 */
function unichr($c) {
    if ($c < 0) {
        // A negative value would otherwise take the one-byte branch and
        // come out as an arbitrary byte.
        return false;
    } elseif ($c <= 0x7F) {
        return chr($c);
    } elseif ($c <= 0x7FF) {
        return chr(0xC0 | ($c >> 6))
            . chr(0x80 | ($c & 0x3F));
    } elseif ($c <= 0xFFFF) {
        return chr(0xE0 | ($c >> 12))
            . chr(0x80 | (($c >> 6) & 0x3F))
            . chr(0x80 | ($c & 0x3F));
    } elseif ($c <= 0x10FFFF) {
        return chr(0xF0 | ($c >> 18))
            . chr(0x80 | (($c >> 12) & 0x3F))
            . chr(0x80 | (($c >> 6) & 0x3F))
            . chr(0x80 | ($c & 0x3F));
    } else {
        return false;
    }
}
00226 
/**
 * Convert a whitespace-separated list of hexadecimal code points
 * (for example "0044 0307") into the corresponding UTF-8 string.
 *
 * Splitting on runs of whitespace and dropping empty tokens keeps leading,
 * trailing or doubled spaces from being read as code point 0 and turning
 * into stray NUL bytes.
 *
 * @param string $c Hexadecimal code points separated by whitespace
 * @return string Concatenation of unichr() for each code point
 */
function unistr($c) {
    $codes = preg_split('/\s+/', $c, -1, PREG_SPLIT_NO_EMPTY);
    return implode("", array_map("unichr", array_map("hexdec", $codes)));
}
00230 
/**
 * Read the next line from the test data and split it into columns.
 *
 * @param resource $f Open file handle to read from
 * @return array|bool false at end of file. For a line that starts with the
 *   comment character, a one-element array holding the raw line. Otherwise
 *   the non-empty SEPARATOR-delimited columns, each converted with unistr(),
 *   followed by one final element holding the comment text ("" if none).
 */
function getRow( $f ) {
    $row = fgets( $f );
    if( $row === false ) return false;
    $row = rtrim($row);
    $pos = strpos( $row, COMMENT );
    if( $pos === 0 ) return array($row);
    $c = "";

    if( $pos ) {
        // The comment has the form "# (samples; ) DESCRIPTION". The samples
        // may themselves contain ")", so the description starts after the
        // LAST ")" on the line, not the first one.
        $pos2 = strrpos( $row, ")" );
        if( $pos2 !== false && $pos2 > $pos ) {
            // Skip the ")" and the space after it; the cast covers PHP
            // versions where substr() returns false past the end of the string.
            $c = (string)substr( $row, $pos2 + 2 );
        } else {
            $c = substr( $row, $pos );
        }
        $row = substr( $row, 0, $pos );
    }

    $ret = array();
    foreach( explode( SEPARATOR, $row ) as $ent ) {
        // Skip empty fields, such as the one after the trailing separator.
        if( trim( $ent ) !== "" ) {
            $ret[] = unistr($ent);
        }
    }
    $ret[] = $c;

    return $ret;
}