MediaWiki  REL1_19
UtfNormalTest2.php
Go to the documentation of this file.
00001 #!/usr/bin/php
00002 <?php
// This is a command-line test script; refuse to run under any other SAPI.
if ( 'cli' !== php_sapi_name() ) {
	exit( "Run me from the command line please.\n" );
}

// Anything after this character is a comment
define( 'COMMENT', '#' );

// Semicolons are used to separate the columns
define( 'SEPARATOR', ';' );

// Test data, from http://unicode.org/Public/UNIDATA/NormalizationTest.txt
// The name is relative, so it is resolved against the current working directory.
$file = "NormalizationTest.txt";

// Read-only handle on the test data; false if the file could not be opened
// (the main loop checks for that before reading).
$f = fopen( $file, "r" );
00024 
require_once( "./UtfNormal.php" );

/*
 * Thin wrappers around the UtfNormal static methods, one per normalization
 * form, called with the input string only.
 */

/** @param $c string to normalize; @return result of UtfNormal::toNFC() */
function normalize_form_c( $c ) {
	return UtfNormal::toNFC( $c );
}

/** @param $c string to normalize; @return result of UtfNormal::toNFD() */
function normalize_form_d( $c ) {
	return UtfNormal::toNFD( $c );
}

/** @param $c string to normalize; @return result of UtfNormal::toNFKC() */
function normalize_form_kc( $c ) {
	return UtfNormal::toNFKC( $c );
}

/** @param $c string to normalize; @return result of UtfNormal::toNFKD() */
function normalize_form_kd( $c ) {
	return UtfNormal::toNFKD( $c );
}

/*
 * Same wrappers, but passing "php" as a second argument to UtfNormal.
 * NOTE(review): presumably this selects a pure-PHP code path; the stock
 * UtfNormal methods may ignore a second argument -- confirm against
 * UtfNormal.php.
 */

/** @param $c string to normalize; @return result of UtfNormal::toNFC( $c, "php" ) */
function normalize_form_c_php( $c ) {
	return UtfNormal::toNFC( $c, "php" );
}

/** @param $c string to normalize; @return result of UtfNormal::toNFD( $c, "php" ) */
function normalize_form_d_php( $c ) {
	return UtfNormal::toNFD( $c, "php" );
}

/** @param $c string to normalize; @return result of UtfNormal::toNFKC( $c, "php" ) */
function normalize_form_kc_php( $c ) {
	return UtfNormal::toNFKC( $c, "php" );
}

/** @param $c string to normalize; @return result of UtfNormal::toNFKD( $c, "php" ) */
function normalize_form_kd_php( $c ) {
	return UtfNormal::toNFKD( $c, "php" );
}
00069 
// Configure assert(): keep assertions active, suppress the built-in warning,
// and route failures to my_assert() instead.
assert_options( ASSERT_ACTIVE, 1 );
assert_options( ASSERT_WARNING, 0 );
// ASSERT_QUIET_EVAL was removed in PHP 8.0; referencing it unconditionally
// is a fatal "undefined constant" error there, so only set it where it exists.
if ( defined( 'ASSERT_QUIET_EVAL' ) ) {
	assert_options( ASSERT_QUIET_EVAL, 1 );
}
assert_options( ASSERT_CALLBACK, 'my_assert' );
00074 
/**
 * Failure reporter registered as the assert() callback.
 *
 * Prints the failed expression together with the current input line number
 * and element 5 of the current row, both read from the globals $lineNo and
 * $col.
 *
 * @param $file string file of the failed assertion (unused)
 * @param $line int line of the failed assertion (unused)
 * @param $code string text of the expression that failed
 */
function my_assert( $file, $line, $code ) {
	global $lineNo;
	global $col;

	$report = "Assertion that '$code' failed on line $lineNo ($col[5])\n";
	echo $report;
}
00079 
$count = 0;
$lineNo = 0;

// Wrapper function names per normalization form, listed in the order the
// forms are cross-checked: array( default wrapper, "php" wrapper ).
$forms = array(
	'NFC'  => array( 'normalize_form_c',  'normalize_form_c_php' ),
	'NFD'  => array( 'normalize_form_d',  'normalize_form_d_php' ),
	'NFKD' => array( 'normalize_form_kd', 'normalize_form_kd_php' ),
	'NFKC' => array( 'normalize_form_kc', 'normalize_form_kc_php' ),
);

// Conformance invariants, in reporting order:
// array( form, index into $col of the expected value, source columns c1..c5 ).
$invariants = array(
	# c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
	array( 'NFC',  1, array( 1, 2, 3 ) ),
	# c4 ==  NFC(c4) ==  NFC(c5)
	array( 'NFC',  3, array( 4, 5 ) ),
	# c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
	array( 'NFD',  2, array( 1, 2, 3 ) ),
	# c5 ==  NFD(c4) ==  NFD(c5)
	array( 'NFD',  4, array( 4, 5 ) ),
	# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	array( 'NFKC', 3, array( 1, 2, 3, 4, 5 ) ),
	# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	array( 'NFKD', 4, array( 1, 2, 3, 4, 5 ) ),
);

if( $f !== false ) {
	while( ( $col = getRow( $f ) ) !== false ) {
		$lineNo++;

		// Only rows with five data columns plus the trailing description
		// are test cases; everything else is skipped.
		if( count( $col ) != 6 ) {
			continue;
		}
		$count++;
		if( $count % 100 === 0 ) echo "Count: $count\n";

		// Checks are done with explicit comparisons rather than
		// assert('...'): string assertions are no longer evaluated on
		// PHP 8 and assert() can be compiled out through zend.assertions,
		// either of which would make every check pass silently. Failures
		// are reported through my_assert() with the same expression text
		// the string assertions used.

		# verify that the pure PHP version is correct
		$normalized = array();
		foreach( $forms as $form => $wrappers ) {
			list( $defaultFn, $phpFn ) = $wrappers;
			for( $n = 1; $n <= 5; $n++ ) {
				$viaDefault = $defaultFn( $col[$n - 1] );
				$viaPhp     = $phpFn( $col[$n - 1] );
				if( $viaDefault !== $viaPhp ) {
					my_assert( __FILE__, __LINE__,
						'$' . $form . 'c' . $n . ' === $' . $form . 'c' . $n . 'p' );
				}
				// Keep the default result for the conformance checks below.
				$normalized[$form][$n] = $viaDefault;
			}
		}

		# verify the conformance invariants between the columns
		foreach( $invariants as $invariant ) {
			list( $form, $expected, $sources ) = $invariant;
			foreach( $sources as $n ) {
				if( $col[$expected] !== $normalized[$form][$n] ) {
					my_assert( __FILE__, __LINE__,
						'$col[' . $expected . '] === $' . $form . 'c' . $n );
				}
			}
		}
	}
	// Release the handle on the test data once every row has been read.
	fclose( $f );
}
echo "done.\n";
00192 
/**
 * Encode a single code point as a UTF-8 byte string.
 *
 * Compare against http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param $c int code point
 * @return string of 1 to 4 bytes, or false if $c is above 0x10FFFF
 */
function unichr( $c ) {
	// Everything up to 0x7F is emitted as a single byte.
	if ( $c <= 0x7F ) {
		return chr( $c );
	}

	// Pick the lead-byte marker and the number of continuation bytes.
	if ( $c <= 0x7FF ) {
		$lead = 0xC0;
		$trailing = 1;
	} elseif ( $c <= 0xFFFF ) {
		$lead = 0xE0;
		$trailing = 2;
	} elseif ( $c <= 0x10FFFF ) {
		$lead = 0xF0;
		$trailing = 3;
	} else {
		return false;
	}

	// The lead byte carries the top bits; each continuation byte carries
	// the next six bits, most significant group first.
	$utf8 = chr( $lead | ( $c >> ( 6 * $trailing ) ) );
	for ( $i = $trailing - 1; $i >= 0; $i-- ) {
		$utf8 .= chr( 0x80 | ( ( $c >> ( 6 * $i ) ) & 0x3F ) );
	}
	return $utf8;
}
00210 
/**
 * Convert a whitespace-separated list of hexadecimal code points
 * (e.g. "0044 0307") into the corresponding UTF-8 string.
 *
 * Tokens are split on runs of whitespace and empty tokens are dropped, so
 * leading, trailing or doubled spaces do not inject NUL bytes (hexdec("")
 * is 0, which a plain explode(" ") would turn into "\0").
 *
 * @param $c string hexadecimal code points separated by whitespace
 * @return string concatenation of unichr() applied to each code point
 */
function unistr( $c ) {
	$tokens = preg_split( '/\s+/', trim( $c ), -1, PREG_SPLIT_NO_EMPTY );
	return implode( "", array_map( "unichr", array_map( "hexdec", $tokens ) ) );
}
00214 
/**
 * Read and parse the next line of the test data.
 *
 * @param $f resource open file handle to read from
 * @return false at end of file; array( $row ) holding the raw line when the
 *   line starts with the COMMENT character; otherwise an array with one
 *   unistr()-decoded entry per non-blank SEPARATOR-delimited column,
 *   followed by the description taken from the trailing comment ("" when
 *   the line has no comment).
 */
function getRow( $f ) {
	$row = fgets( $f );
	if ( $row === false ) {
		return false;
	}
	$row = rtrim( $row );

	$pos = strpos( $row, COMMENT );
	if ( $pos === 0 ) {
		// Whole-line comment: hand the raw text back untouched.
		return array( $row );
	}

	$c = "";
	if ( $pos !== false ) {
		$comment = substr( $row, $pos );
		// The description follows the LAST ")" of the comment. Searching
		// for the first ")" cuts at the wrong place when the sample
		// characters inside the parentheses themselves contain ")".
		$close = strrpos( $comment, ")" );
		if ( $close !== false ) {
			// Skip the ")" and the space after it. substr() returns false
			// on older PHP when that offset is past the end, so cast to
			// keep $c a string.
			$c = (string)substr( $comment, $close + 2 );
		} else {
			$c = $comment;
		}
		// Keep only the data columns in front of the comment.
		$row = substr( $row, 0, $pos );
	}

	$ret = array();
	foreach ( explode( SEPARATOR, $row ) as $ent ) {
		// Blank columns (such as the one after the final separator) are dropped.
		if ( trim( $ent ) !== "" ) {
			$ret[] = unistr( $ent );
		}
	}
	$ret[] = $c;

	return $ret;
}