MediaWiki  REL1_24
UtfNormalTest.php
Go to the documentation of this file.
00001 <?php
# This script is a command-line tool; stop immediately under any other SAPI.
if ( 'cli' != PHP_SAPI ) {
    exit( "Run me from the command line please.\n" );
}
00031 
# When true, failing rows are re-run with every comparison printed.
$verbose = true;
# Uncomment to dump raw bytes instead of code point sequences:
#define( 'PRETTY_UTF8', true );

# pretty() is chosen once, at load time, depending on whether PRETTY_UTF8
# has been defined by this point.
if ( !defined( 'PRETTY_UTF8' ) ) {
    /**
     * Render a string for diagnostics via utf8ToHexSequence(), upper-cased.
     *
     * @param string $string Text to render
     * @return string Upper-cased result of utf8ToHexSequence()
     */
    function pretty( $string ) {
        $hex = utf8ToHexSequence( $string );

        return strtoupper( $hex );
    }
} else {
    /**
     * Render a string for diagnostics as the upper-cased hex dump of its bytes.
     *
     * @param string $string Text to render
     * @return string Upper-cased bin2hex() output
     */
    function pretty( $string ) {
        $hex = bin2hex( $string );

        return strtoupper( $hex );
    }
}
00049 
# Passing --icu on the command line loads the php_utfnormal extension
# before the normalizer classes are pulled in.
if ( isset( $_SERVER['argv'] ) && array_search( '--icu', $_SERVER['argv'] ) !== false ) {
    dl( 'php_utfnormal.so' );
}
00053 
00054 require_once 'UtfNormalDefines.php';
00055 require_once 'UtfNormalUtil.php';
00056 require_once 'UtfNormal.php';
00057 
# Open the conformance data file; without it there is nothing to test.
$in = fopen( "NormalizationTest.txt", "rt" );
if ( !$in ) {
    print "Couldn't open NormalizationTest.txt -- can't run tests.\n";
    print "If necessary, manually download this file. It can be obtained at\n";
    # End the last line with a newline, like the UnicodeData.txt message does,
    # so the shell prompt does not end up glued to the URL.
    print "http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt\n";
    exit( -1 );
}
00065 
$normalizer = new UtfNormal;

# Running counters; reportResults() zeroes them after each report.
$total = 0;
$success = 0;
$failure = 0;
# Overall verdict across every reported batch.
$ok = true;
# Set of first-column strings seen here, keyed by the string itself.
$testedChars = array();

while ( false !== ( $line = fgets( $in ) ) ) {
    # Split at the first '#' only, so a comment that itself contains '#'
    # stays intact, and pad so that a line without any '#' still yields an
    # (empty) comment instead of an undefined offset.
    list( $data, $comment ) = array_pad( explode( '#', $line, 2 ), 2, '' );
    if ( trim( $data ) === '' ) {
        # Comment-only or blank line: nothing to test.
        continue;
    }
    $matches = array();
    if ( preg_match( '/@Part([\d])/', $data, $matches ) ) {
        # A new part begins: report the part that just ended (there is none
        # before part 0), then announce the new one.
        if ( $matches[1] > 0 ) {
            $ok = reportResults( $total, $success, $failure ) && $ok;
        }
        print "Part {$matches[1]}: $comment";
        continue;
    }

    # Convert each ';'-separated hex sequence to UTF-8 and prepend a dummy
    # entry so the test columns are addressed from index 1.
    $columns = array_map( "hexSequenceToUtf8", explode( ";", $data ) );
    array_unshift( $columns, '' );

    $testedChars[$columns[1]] = true;
    $total++;
    if ( testNormals( $normalizer, $columns, $comment, $verbose ) ) {
        $success++;
    } else {
        $failure++;
    }
    # Progress indicator every 100 rows.
    if ( $total % 100 == 0 ) {
        print "$total ";
    }
}
fclose( $in );

# Report the batch that was still open when the file ended.
$ok = reportResults( $total, $success, $failure ) && $ok;
00102 
$in = fopen( "UnicodeData.txt", "rt" );
if ( !$in ) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit( -1 );
}
print "Now testing invariants...\n";

while ( false !== ( $line = fgets( $in ) ) ) {
    $cols = explode( ';', $line );
    if ( count( $cols ) < 2 ) {
        # Blank or malformed line: both the code point field and the second
        # field are needed below, so skip instead of reading a missing offset.
        continue;
    }
    $char = codepointToUtf8( hexdec( $cols[0] ) );
    $desc = $cols[0] . ": " . $cols[1];
    if (
        $char < "\x20"
        || ( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST )
    ) {
        # Can't check NULL with the ICU plugin, as null bytes fail in C land.
        # Skip other control characters, as we strip them for XML safety.
        # Surrogates are illegal on their own or in UTF-8, ignore.
        continue;
    }
    # Only characters that were not a first column in the previous pass are
    # checked for being unchanged by every normalization form.
    if ( empty( $testedChars[$char] ) ) {
        $total++;
        if ( testInvariant( $normalizer, $char, $desc, $verbose ) ) {
            $success++;
        } else {
            $failure++;
        }
        # Progress indicator every 100 characters.
        if ( $total % 100 == 0 ) {
            print "$total ";
        }
    }
}
fclose( $in );

$ok = reportResults( $total, $success, $failure ) && $ok;
00135 
# Turn the accumulated verdict into a final message and process exit status.
print $ok ? "TEST SUCCEEDED!\n" : "TEST FAILED!\n";
exit( $ok ? 0 : -1 );
00143 
00144 ## ------
00145 
/**
 * Print a summary of the current batch of tests and reset the counters.
 *
 * All three counters are passed by reference and are set back to zero
 * before returning, so the caller can start counting the next batch.
 *
 * @param int &$total Number of tests run in this batch
 * @param int &$success Number of tests that passed
 * @param int &$failure Number of tests that failed
 * @return bool True only if at least one test passed and none failed
 */
function reportResults( &$total, &$success, &$failure ) {
    # An empty batch must not divide by zero: that is a warning on older PHP
    # and a DivisionByZeroError on PHP 8. Report 0% for both figures instead.
    if ( $total > 0 ) {
        $percSucc = intval( $success * 100 / $total );
        $percFail = intval( $failure * 100 / $total );
    } else {
        $percSucc = 0;
        $percFail = 0;
    }
    print "\n";
    print "$success tests successful ($percSucc%)\n";
    print "$failure tests failed ($percFail%)\n\n";
    $ok = ( $success > 0 && $failure == 0 );
    $total = 0;
    $success = 0;
    $failure = 0;

    return $ok;
}
00159 
/**
 * Run every normalization checker against one row of test columns.
 *
 * All five checkers (NFC, NFD, NFKC, NFKD, cleanUp) are always invoked, even
 * after one of them has failed. If the row fails while $verbose is set and
 * this is not already a reporting pass, $comment is printed and the row is
 * run once more with $reportFailure enabled.
 *
 * @param object &$u Normalizer, handed through to the individual checkers
 * @param array $c Test columns, handed through to the individual checkers
 * @param string $comment Text printed ahead of the detailed re-run
 * @param bool $verbose Whether a failing row triggers the detailed re-run
 * @param bool $reportFailure Handed to each checker as its verbosity flag
 * @return bool True when every checker passed
 */
function testNormals( &$u, $c, $comment, $verbose, $reportFailure = false ) {
    $checkers = array( 'testNFC', 'testNFD', 'testNFKC', 'testNFKD', 'testCleanUp' );

    $passed = true;
    foreach ( $checkers as $checker ) {
        if ( !$checker( $u, $c, $comment, $reportFailure ) ) {
            $passed = false;
        }
    }

    if ( !$passed && $verbose && !$reportFailure ) {
        print $comment;
        testNormals( $u, $c, $comment, $verbose, true );
    }

    return $passed;
}
00174 
/**
 * Compare two strings with strcmp() and optionally print the outcome.
 *
 * When $verbose is set, one line is printed showing the verdict, the form
 * label, the column number and both strings rendered through pretty().
 *
 * @param string $a First string
 * @param string $b Second string
 * @param int $col Column number shown in the printed line
 * @param string $form Label shown in the printed line
 * @param bool $verbose Whether to print the comparison
 * @return bool True when strcmp() reports the strings as equal
 */
function verbosify( $a, $b, $col, $form, $verbose ) {
    $matched = ( 0 == strcmp( $a, $b ) );
    if ( !$verbose ) {
        return $matched;
    }

    $left = pretty( $a );
    $right = pretty( $b );
    if ( $matched ) {
        $status = "succeed";
        $relation = "==";
    } else {
        $status = " failed";
        $relation = "!=";
    }
    print "  $status $form c$col '$left' $relation '$right'\n";

    return $matched;
}
00188 
/**
 * Check the NFC expectations for one row of test columns.
 *
 * Columns 1-3 must each convert to column 2, and columns 4-5 to column 4.
 * Every column is checked even after a mismatch.
 *
 * @param object &$u Normalizer providing toNFC()
 * @param array $c Test columns, read at indexes 1 through 5
 * @param string $comment Not used by this checker
 * @param bool $verbose Passed to verbosify() to print each comparison
 * @return bool True when all five comparisons matched
 */
function testNFC( &$u, $c, $comment, $verbose ) {
    # input column => column holding the expected NFC result
    $expectedFor = array( 1 => 2, 2 => 2, 3 => 2, 4 => 4, 5 => 4 );

    $passed = true;
    foreach ( $expectedFor as $col => $expectedCol ) {
        if ( !verbosify( $c[$expectedCol], $u->toNFC( $c[$col] ), $col, 'NFC', $verbose ) ) {
            $passed = false;
        }
    }

    return $passed;
}
00198 
/**
 * Check the cleanUp() expectations for one row of test columns.
 *
 * The expectations mirror the NFC ones: columns 1-3 must each come out as
 * column 2, and columns 4-5 as column 4. Every column is checked even after
 * a mismatch.
 *
 * @param object &$u Normalizer providing cleanUp()
 * @param array $c Test columns, read at indexes 1 through 5
 * @param string $comment Not used by this checker
 * @param bool $verbose Passed to verbosify() to print each comparison
 * @return bool True when all five comparisons matched
 */
function testCleanUp( &$u, $c, $comment, $verbose ) {
    # input column => column holding the expected cleanUp() result
    $expectedFor = array( 1 => 2, 2 => 2, 3 => 2, 4 => 4, 5 => 4 );

    $passed = true;
    foreach ( $expectedFor as $col => $expectedCol ) {
        # cleanUp() is given a scratch copy rather than the array element.
        $x = $c[$col];
        if ( !verbosify( $c[$expectedCol], $u->cleanUp( $x ), $col, 'cleanUp', $verbose ) ) {
            $passed = false;
        }
    }

    return $passed;
}
00213 
/**
 * Check the NFD expectations for one row of test columns.
 *
 * Columns 1-3 must each convert to column 3, and columns 4-5 to column 5.
 * Every column is checked even after a mismatch.
 *
 * @param object &$u Normalizer providing toNFD()
 * @param array $c Test columns, read at indexes 1 through 5
 * @param string $comment Not used by this checker
 * @param bool $verbose Passed to verbosify() to print each comparison
 * @return bool True when all five comparisons matched
 */
function testNFD( &$u, $c, $comment, $verbose ) {
    # input column => column holding the expected NFD result
    $expectedFor = array( 1 => 3, 2 => 3, 3 => 3, 4 => 5, 5 => 5 );

    $passed = true;
    foreach ( $expectedFor as $col => $expectedCol ) {
        if ( !verbosify( $c[$expectedCol], $u->toNFD( $c[$col] ), $col, 'NFD', $verbose ) ) {
            $passed = false;
        }
    }

    return $passed;
}
00223 
/**
 * Check the NFKC expectations for one row of test columns.
 *
 * Each of columns 1-5 must convert to column 4. Every column is checked
 * even after a mismatch.
 *
 * @param object &$u Normalizer providing toNFKC()
 * @param array $c Test columns, read at indexes 1 through 5
 * @param string $comment Not used by this checker
 * @param bool $verbose Passed to verbosify() to print each comparison
 * @return bool True when all five comparisons matched
 */
function testNFKC( &$u, $c, $comment, $verbose ) {
    $passed = true;
    for ( $col = 1; $col <= 5; $col++ ) {
        if ( !verbosify( $c[4], $u->toNFKC( $c[$col] ), $col, 'NFKC', $verbose ) ) {
            $passed = false;
        }
    }

    return $passed;
}
00233 
/**
 * Check the NFKD expectations for one row of test columns.
 *
 * Each of columns 1-5 must convert to column 5. Every column is checked
 * even after a mismatch.
 *
 * @param object &$u Normalizer providing toNFKD()
 * @param array $c Test columns, read at indexes 1 through 5
 * @param string $comment Not used by this checker
 * @param bool $verbose Passed to verbosify() to print each comparison
 * @return bool True when all five comparisons matched
 */
function testNFKD( &$u, $c, $comment, $verbose ) {
    $passed = true;
    for ( $col = 1; $col <= 5; $col++ ) {
        if ( !verbosify( $c[5], $u->toNFKD( $c[$col] ), $col, 'NFKD', $verbose ) ) {
            $passed = false;
        }
    }

    return $passed;
}
00243 
/**
 * Check that a string is left unchanged by every normalizer operation.
 *
 * toNFC(), toNFD(), toNFKC(), toNFKD() and cleanUp() are each applied to
 * $char and the result compared with $char itself; all five are always run.
 * If any of them differs while $verbose is set and this is not already a
 * reporting pass, $desc is printed and the checks are run once more with
 * $reportFailure enabled.
 *
 * @param object &$u Normalizer providing the five operations
 * @param string $char String expected to come back unchanged
 * @param string $desc Text printed ahead of the detailed re-run
 * @param bool $verbose Whether a failure triggers the detailed re-run
 * @param bool $reportFailure Passed to verbosify() as its verbosity flag
 * @return bool True when all five results equalled $char
 */
function testInvariant( &$u, $char, $desc, $verbose, $reportFailure = false ) {
    # printed label => normalizer method, in the order they are exercised
    $operations = array(
        'NFC' => 'toNFC',
        'NFD' => 'toNFD',
        'NFKC' => 'toNFKC',
        'NFKD' => 'toNFKD',
        'cleanUp' => 'cleanUp',
    );

    $unchanged = true;
    foreach ( $operations as $form => $method ) {
        if ( !verbosify( $char, $u->$method( $char ), 1, $form, $reportFailure ) ) {
            $unchanged = false;
        }
    }

    if ( !$unchanged && $verbose && !$reportFailure ) {
        print $desc;
        testInvariant( $u, $char, $desc, $verbose, true );
    }

    return $unchanged;
}