MediaWiki  REL1_24
UtfNormalTest2.php
Go to the documentation of this file.
00001 #!/usr/bin/env php
00002 <?php
// This harness is a command-line tool; bail out under any other SAPI.
if ( PHP_SAPI !== 'cli' ) {
    exit( "Run me from the command line please.\n" );
}

// Test data, from http://unicode.org/Public/UNIDATA/NormalizationTest.txt
$file = "NormalizationTest.txt";

// Everything from this character to the end of a line is a comment
define( 'COMMENT', '#' );

// Columns are delimited by semicolons
define( 'SEPARATOR', ';' );

// Read-only handle on the test data; false when the file cannot be opened
$f = fopen( $file, "r" );

00039 
00063 require_once './UtfNormal.php';
/**
 * Run a string through UtfNormal::toNFC().
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFC() returns for $c
 */
function normalize_form_c( $c ) {
    $normalized = UtfNormal::toNFC( $c );

    return $normalized;
}
00067 
/**
 * Run a string through UtfNormal::toNFD().
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFD() returns for $c
 */
function normalize_form_d( $c ) {
    $normalized = UtfNormal::toNFD( $c );

    return $normalized;
}
00071 
/**
 * Run a string through UtfNormal::toNFKC().
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFKC() returns for $c
 */
function normalize_form_kc( $c ) {
    $normalized = UtfNormal::toNFKC( $c );

    return $normalized;
}
00075 
/**
 * Run a string through UtfNormal::toNFKD().
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFKD() returns for $c
 */
function normalize_form_kd( $c ) {
    $normalized = UtfNormal::toNFKD( $c );

    return $normalized;
}
00079 
/**
 * Run a string through UtfNormal::toNFC(), passing "php" as a second argument.
 *
 * NOTE(review): presumably the extra argument selects a pure-PHP code path;
 * confirm that UtfNormal::toNFC() actually accepts and honours it.
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFC() returns for $c
 */
function normalize_form_c_php( $c ) {
    $normalized = UtfNormal::toNFC( $c, "php" );

    return $normalized;
}
00090 
/**
 * Run a string through UtfNormal::toNFD(), passing "php" as a second argument.
 *
 * NOTE(review): presumably the extra argument selects a pure-PHP code path;
 * confirm that UtfNormal::toNFD() actually accepts and honours it.
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFD() returns for $c
 */
function normalize_form_d_php( $c ) {
    $normalized = UtfNormal::toNFD( $c, "php" );

    return $normalized;
}
00094 
/**
 * Run a string through UtfNormal::toNFKC(), passing "php" as a second argument.
 *
 * NOTE(review): presumably the extra argument selects a pure-PHP code path;
 * confirm that UtfNormal::toNFKC() actually accepts and honours it.
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFKC() returns for $c
 */
function normalize_form_kc_php( $c ) {
    $normalized = UtfNormal::toNFKC( $c, "php" );

    return $normalized;
}
00098 
/**
 * Run a string through UtfNormal::toNFKD(), passing "php" as a second argument.
 *
 * NOTE(review): presumably the extra argument selects a pure-PHP code path;
 * confirm that UtfNormal::toNFKD() actually accepts and honours it.
 *
 * @param string $c Text to normalize
 * @return mixed Whatever UtfNormal::toNFKD() returns for $c
 */
function normalize_form_kd_php( $c ) {
    $normalized = UtfNormal::toNFKD( $c, "php" );

    return $normalized;
}
00102 
// Enable assertions, silence the built-in warning and route every failure
// through my_assert() so it can report the offending test-data line.
assert_options( ASSERT_ACTIVE, 1 );
assert_options( ASSERT_WARNING, 0 );
// ASSERT_QUIET_EVAL was removed in PHP 8.0; referencing it unguarded there is
// a fatal "Undefined constant" error, so only set it where it still exists.
if ( defined( 'ASSERT_QUIET_EVAL' ) ) {
    assert_options( ASSERT_QUIET_EVAL, 1 );
}
assert_options( ASSERT_CALLBACK, 'my_assert' );
00107 
/**
 * Failure reporter: prints the failed expression together with the current
 * data-line counter and the sixth cell of the current row.
 *
 * Reads the script-level globals $col (current row) and $lineNo (line counter).
 *
 * @param string $file File in which the check failed (not used in the message)
 * @param int $line Line on which the check failed (not used in the message)
 * @param string $code Text of the expression that did not hold
 */
function my_assert( $file, $line, $code ) {
    // @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
    global $col, $lineNo;
    // @codingStandardsIgnoreEnd

    $description = $col[5];

    echo "Assertion that '" . $code . "' failed on line " . $lineNo . " (" . $description . ")\n";
}
00115 
// Main driver: for every test row, normalize each of the five data columns
// with all four forms (via both wrapper families) and check the results.
//
// The checks are plain comparisons reported through my_assert() rather than
// assert( '<string>' ): string assertions are deprecated since PHP 7.2 and no
// longer evaluated in PHP 8, and on PHP 7+ assert() is compiled out entirely
// when zend.assertions <= 0, which would turn every check into a silent no-op.
$count = 0;
$lineNo = 0;

if ( $f === false ) {
    // Without the data file nothing can be verified; say so instead of
    // falling through to a misleading "done."
    fwrite( STDERR, "Could not open $file for reading.\n" );
    exit( 1 );
}

// Form name => suffix of the normalize_form_* wrapper pair, in check order
$forms = array(
    'NFC' => 'c',
    'NFD' => 'd',
    'NFKD' => 'kd',
    'NFKC' => 'kc',
);

// Each entry: array( index into $col holding the expected value,
//                    form name, 1-based source columns whose result must match )
$invariants = array(
    # c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
    array( 1, 'NFC', array( 1, 2, 3 ) ),
    # c4 ==  NFC(c4) ==  NFC(c5)
    array( 3, 'NFC', array( 4, 5 ) ),
    # c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
    array( 2, 'NFD', array( 1, 2, 3 ) ),
    # c5 ==  NFD(c4) ==  NFD(c5)
    array( 4, 'NFD', array( 4, 5 ) ),
    # c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
    array( 3, 'NFKC', array( 1, 2, 3, 4, 5 ) ),
    # c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
    array( 4, 'NFKD', array( 1, 2, 3, 4, 5 ) ),
);

// $col and $lineNo stay script-level globals: my_assert() reads both.
while ( ( $col = getRow( $f ) ) !== false ) {
    $lineNo++;

    // Only rows with five data cells plus the trailing comment cell are tests
    if ( count( $col ) != 6 ) {
        continue;
    }
    $count++;
    if ( $count % 100 === 0 ) {
        echo "Count: $count\n";
    }

    # verify that the pure PHP version is correct
    $results = array();
    foreach ( $forms as $form => $suffix ) {
        $native = 'normalize_form_' . $suffix;
        $purePhp = $native . '_php';
        for ( $n = 1; $n <= 5; $n++ ) {
            $input = $col[$n - 1];
            $results[$form][$n] = $native( $input );
            if ( $results[$form][$n] !== $purePhp( $input ) ) {
                my_assert(
                    __FILE__,
                    __LINE__,
                    '$' . $form . 'c' . $n . ' === $' . $form . 'c' . $n . 'p'
                );
            }
        }
    }

    # verify the results against the expected columns of the row
    foreach ( $invariants as $invariant ) {
        list( $expectedIdx, $form, $sources ) = $invariant;
        foreach ( $sources as $n ) {
            if ( $col[$expectedIdx] !== $results[$form][$n] ) {
                my_assert(
                    __FILE__,
                    __LINE__,
                    '$col[' . $expectedIdx . '] === $' . $form . 'c' . $n
                );
            }
        }
    }
}

fclose( $f );
echo "done.\n";
00228 
00229 // Compare against http://en.wikipedia.org/wiki/UTF-8#Description
/**
 * Encode a single Unicode code point as a UTF-8 byte sequence.
 *
 * @param int $c Code point number
 * @return string|bool The 1-4 byte UTF-8 encoding, or false when $c cannot be
 *   encoded: negative, a UTF-16 surrogate (U+D800..U+DFFF), or above U+10FFFF
 */
function unichr( $c ) {
    // Negative numbers would otherwise fall into the one-byte branch and come
    // out as an arbitrary byte; surrogates have no well-formed UTF-8 encoding.
    if ( $c < 0 || ( $c >= 0xD800 && $c <= 0xDFFF ) ) {
        return false;
    }

    if ( $c <= 0x7F ) {
        // 0xxxxxxx
        return chr( $c );
    }

    if ( $c <= 0x7FF ) {
        // 110xxxxx 10xxxxxx
        return chr( 0xC0 | ( $c >> 6 ) )
            . chr( 0x80 | ( $c & 0x3F ) );
    }

    if ( $c <= 0xFFFF ) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return chr( 0xE0 | ( $c >> 12 ) )
            . chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
            . chr( 0x80 | ( $c & 0x3F ) );
    }

    if ( $c <= 0x10FFFF ) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr( 0xF0 | ( $c >> 18 ) )
            . chr( 0x80 | ( ( $c >> 12 ) & 0x3F ) )
            . chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) )
            . chr( 0x80 | ( $c & 0x3F ) );
    }

    // Beyond the Unicode code space
    return false;
}
00246 
/**
 * Turn a space-separated list of hexadecimal code point numbers into the
 * concatenation of what unichr() returns for each of them.
 *
 * @param string $c Hex numbers separated by single spaces, e.g. "0044 0307"
 * @return string The concatenated unichr() results
 */
function unistr( $c ) {
    $encoded = "";

    foreach ( explode( " ", $c ) as $hex ) {
        $encoded .= unichr( hexdec( $hex ) );
    }

    return $encoded;
}
00250 
/**
 * Read the next line from the test data and split it into cells.
 *
 * - A line that starts with the COMMENT character is returned untouched as a
 *   single-element array.
 * - Otherwise the part before COMMENT is split on SEPARATOR, every non-blank
 *   field is converted with unistr(), and the comment text is appended as the
 *   last element ("" when the line carries no comment).
 *
 * When the comment contains a ")", only the text following the LAST ")" (and
 * the character after it) is kept. The last one is used because the text in
 * front of it may itself contain ")" characters, in which case cutting at the
 * first ")" would leave stray text in front of the description.
 *
 * @param resource $f Open, readable file handle
 * @return array|bool List of cells, or false once fgets() returns false
 */
function getRow( $f ) {
    $row = fgets( $f );
    if ( $row === false ) {
        return false;
    }
    $row = rtrim( $row );

    $pos = strpos( $row, COMMENT );
    if ( $pos === 0 ) {
        // Whole line is a comment: hand it back as a single cell
        return array( $row );
    }

    $c = "";
    if ( $pos !== false ) {
        $pos2 = strrpos( $row, ")" );
        if ( $pos2 !== false && $pos2 > $pos ) {
            // Skip the ")" and the character following it.
            // The cast keeps $c a string where substr() returns false.
            $c = (string)substr( $row, $pos2 + 2 );
        } else {
            $c = substr( $row, $pos );
        }
        // Keep only the data part for column splitting
        $row = substr( $row, 0, $pos );
    }

    $ret = array();
    foreach ( explode( SEPARATOR, $row ) as $ent ) {
        // Skip blank fields, such as the one after the final separator
        if ( trim( $ent ) !== "" ) {
            $ret[] = unistr( $ent );
        }
    }
    $ret[] = $c;

    return $ret;
}