MediaWiki  REL1_20
UtfNormalTest.php
Go to the documentation of this file.
00001 <?php
# When true, a failing test case is re-run with per-comparison diagnostics.
$verbose = true;
# Uncomment to dump strings as raw hex bytes instead of as code points.
#define( 'PRETTY_UTF8', true );

# Both variants use preg_replace_callback() rather than the /e modifier.
# /e was deprecated in PHP 5.5 and removed in PHP 7; it also backslash-escaped
# quotes in the backreference, so a single quote placed inside the
# double-quoted replacement code was dumped as a backslash.
# Closures require PHP 5.3 or later.
if( defined( 'PRETTY_UTF8' ) ) {
	/**
	 * Render a string as a run of two-digit uppercase hex values, one per byte.
	 *
	 * @param $string String: the bytes to dump
	 * @return String
	 */
	function pretty( $string ) {
		return preg_replace_callback( '/([\x00-\xff])/',
			function( $m ) {
				return sprintf( "%02X", ord( $m[1] ) );
			},
			$string );
	}
} else {
	/**
	 * Render a UTF-8 string as space-separated four-digit (minimum) uppercase
	 * hex values, one per character as converted by utf8ToCodepoint().
	 *
	 * @param $string String: UTF-8 text to dump
	 * @return String
	 */
	function pretty( $string ) {
		return trim( preg_replace_callback( '/(.)/us',
			function( $m ) {
				return sprintf( "%04X ", utf8ToCodepoint( $m[1] ) );
			},
			$string ) );
	}
}
00048 
# Passing --icu on the command line loads the php_utfnormal extension
# before the UtfNormal files are included.
$icuRequested = isset( $_SERVER['argv'] )
	&& array_search( '--icu', $_SERVER['argv'] ) !== false;
if( $icuRequested ) {
	dl( 'php_utfnormal.so' );
}
00052 
00053 require_once 'UtfNormalDefines.php';
00054 require_once 'UtfNormalUtil.php';
00055 require_once 'UtfNormal.php';
00056 
# This is a console script; refuse to run under any other SAPI.
if( php_sapi_name() != 'cli' ) {
	die( "Run me from the command line please.\n" );
}

# The conformance data file is read from the current working directory.
$in = fopen("NormalizationTest.txt", "rt");
if( !$in ) {
	print "Couldn't open NormalizationTest.txt -- can't run tests.\n";
	print "If necessary, manually download this file. It can be obtained at\n";
	# Terminate the last line too, matching the UnicodeData.txt message below,
	# so the shell prompt does not end up glued to the URL.
	print "http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt\n";
	exit(-1);
}

$normalizer = new UtfNormal;

# Running tallies for the current batch; reportResults() zeroes them.
$total = 0;
$success = 0;
$failure = 0;
# Overall verdict, and-ed with the result of every reportResults() call.
$ok = true;
# Source strings (column 1) already exercised by the conformance tests,
# used as array keys so the invariant pass can skip them.
$testedChars = array();
# Walk the conformance file. Lines are either "@PartN # title" section
# markers, comment-only lines starting with '#', or five ';'-separated hex
# sequences followed by a '#' comment.
while( false !== ( $line = fgets( $in ) ) ) {
	# Split on the first '#' only, so a comment that itself contains '#'
	# is kept whole; pad so a line with no '#' still yields a comment
	# (empty) instead of an undefined-offset notice.
	list( $data, $comment ) = array_pad( explode( '#', $line, 2 ), 2, '' );
	# Skip comment-only lines and lines holding nothing but whitespace.
	if( trim( $data ) === '' ) continue;
	$matches = array();
	if( preg_match( '/@Part([\d])/', $data, $matches ) ) {
		# A new part closes the previous batch; part 0 has no predecessor.
		if( $matches[1] > 0 ) {
			$ok = reportResults( $total, $success, $failure ) && $ok;
		}
		print "Part {$matches[1]}: $comment";
		continue;
	}

	# Shift the columns to 1-based indexes so they line up with the
	# c1..c5 numbering used by the test functions.
	$columns = array_map( "hexSequenceToUtf8", explode( ";", $data ) );
	array_unshift( $columns, '' );

	$testedChars[$columns[1]] = true;
	$total++;
	if( testNormals( $normalizer, $columns, $comment, $verbose ) ) {
		$success++;
	} else {
		$failure++;
	}
	# Progress ticker.
	if( $total % 100 == 0 ) print "$total ";
}
fclose( $in );

# Report the final part.
$ok = reportResults( $total, $success, $failure ) && $ok;
00104 
# Second pass: every character listed in UnicodeData.txt that the
# conformance file did not cover is run through testInvariant().
$in = fopen("UnicodeData.txt", "rt" );
if( !$in ) {
	print "Can't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit(-1);
}
print "Now testing invariants...\n";
while( false !== ($line = fgets( $in ) ) ) {
	# Field 0 is the hex code point, field 1 the character name.
	$cols = explode( ';', $line );
	$char = codepointToUtf8( hexdec( $cols[0] ) );
	$desc = $cols[0] . ": " . $cols[1];

	# Can't check NULL with the ICU plugin, as null bytes fail in C land.
	# Skip other control characters, as we strip them for XML safety.
	$isControl = $char < "\x20";
	# Surrogates are illegal on their own or in UTF-8, ignore.
	$isSurrogate = $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST;
	if( $isControl || $isSurrogate ) {
		continue;
	}

	# Already exercised by the conformance tests.
	if( !empty( $testedChars[$char] ) ) {
		continue;
	}

	$total++;
	if( testInvariant( $normalizer, $char, $desc, $verbose ) ) {
		$success++;
	} else {
		$failure++;
	}
	# Progress ticker.
	if( $total % 100 == 0 ) print "$total ";
}
fclose( $in );

$ok = reportResults( $total, $success, $failure ) && $ok;
00136 
# Overall verdict: status 0 only when every reported batch was clean.
print $ok ? "TEST SUCCEEDED!\n" : "TEST FAILED!\n";
exit( $ok ? 0 : -1 );
00144 
00145 ## ------
00146 
/**
 * Print a pass/fail summary for the batch just run, then zero the counters
 * so the caller can start the next batch.
 *
 * @param $total Integer: tests run in this batch (reset to 0)
 * @param $success Integer: tests that passed (reset to 0)
 * @param $failure Integer: tests that failed (reset to 0)
 * @return Boolean: true only if at least one test passed and none failed
 */
function reportResults( &$total, &$success, &$failure ) {
	# An empty batch has nothing to take a percentage of. Report 0% instead
	# of dividing by zero (a warning in PHP 5/7, DivisionByZeroError in PHP 8).
	$percSucc = $total ? intval( $success * 100 / $total ) : 0;
	$percFail = $total ? intval( $failure * 100 / $total ) : 0;
	print "\n";
	print "$success tests successful ($percSucc%)\n";
	print "$failure tests failed ($percFail%)\n\n";
	# A batch with no successes counts as a failure, even with no failures.
	$ok = ($success > 0 && $failure == 0);
	$total = 0;
	$success = 0;
	$failure = 0;
	return $ok;
}
00159 
/**
 * Run one conformance line through all four normalization forms and
 * cleanUp(). On failure in verbose mode, print the comment and run the
 * whole line again with per-comparison output enabled.
 *
 * @param $u UtfNormal: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: comment text from the data line
 * @param $verbose Boolean: whether to re-run a failing line with diagnostics
 * @param $reportFailure Boolean: true on the diagnostic re-run; passed to the
 *   per-form tests as their verbose flag
 * @return Boolean: true if every comparison matched
 */
function testNormals( &$u, $c, $comment, $verbose, $reportFailure = false ) {
	# Every form is run unconditionally; none is skipped after a failure.
	$outcomes = array(
		testNFC( $u, $c, $comment, $reportFailure ),
		testNFD( $u, $c, $comment, $reportFailure ),
		testNFKC( $u, $c, $comment, $reportFailure ),
		testNFKD( $u, $c, $comment, $reportFailure ),
		testCleanUp( $u, $c, $comment, $reportFailure ),
	);
	$result = !in_array( false, $outcomes, true );

	if( !$result && $verbose && !$reportFailure ) {
		print $comment;
		testNormals( $u, $c, $comment, $verbose, true );
	}
	return $result;
}
00173 
/**
 * Compare an expected string with an actual one, optionally printing a
 * one-line report of the comparison.
 *
 * @param $a String: expected value
 * @param $b String: actual value
 * @param $col Integer: source column number, shown as "c<col>"
 * @param $form String: label of the operation being checked
 * @param $verbose Boolean: whether to print the report line
 * @return Boolean: true if the two strings are identical
 */
function verbosify( $a, $b, $col, $form, $verbose ) {
	# strcmp() yields 0 exactly when the two strings are equal.
	$result = !strcmp( $a, $b );
	if( !$verbose ) {
		return $result;
	}

	$aa = pretty( $a );
	$bb = pretty( $b );
	if( $result ) {
		$ok = "succeed";
		$eq = "==";
	} else {
		$ok = " failed";
		$eq = "!=";
	}
	print "  $ok $form c$col '$aa' $eq '$bb'\n";
	return $result;
}
00186 
/**
 * Check toNFC() on all five columns: columns 1-3 must yield column 2,
 * columns 4-5 must yield column 4.
 *
 * @param $u UtfNormal: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused here
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all five comparisons matched
 */
function testNFC( &$u, $c, $comment, $verbose ) {
	# input column => column holding the expected output
	$expected = array( 1 => 2, 2 => 2, 3 => 2, 4 => 4, 5 => 4 );
	$result = true;
	foreach( $expected as $col => $want ) {
		$result = verbosify( $c[$want], $u->toNFC( $c[$col] ), $col, 'NFC', $verbose ) && $result;
	}
	return $result;
}
00195 
/**
 * Check cleanUp() on all five columns against the same expectations as
 * testNFC(): columns 1-3 must yield column 2, columns 4-5 column 4.
 *
 * @param $u UtfNormal: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused here
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all five comparisons matched
 */
function testCleanUp( &$u, $c, $comment, $verbose ) {
	# input column => column holding the expected output
	$expected = array( 1 => 2, 2 => 2, 3 => 2, 4 => 4, 5 => 4 );
	$result = true;
	foreach( $expected as $col => $want ) {
		# cleanUp() gets a scratch copy, never the column itself
		# (presumably because it may alter its argument -- TODO confirm).
		$x = $c[$col];
		$result = verbosify( $c[$want], $u->cleanUp( $x ), $col, 'cleanUp', $verbose ) && $result;
	}
	return $result;
}
00209 
/**
 * Check toNFD() on all five columns: columns 1-3 must yield column 3,
 * columns 4-5 must yield column 5.
 *
 * @param $u UtfNormal: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused here
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all five comparisons matched
 */
function testNFD( &$u, $c, $comment, $verbose ) {
	# input column => column holding the expected output
	$expected = array( 1 => 3, 2 => 3, 3 => 3, 4 => 5, 5 => 5 );
	$result = true;
	foreach( $expected as $col => $want ) {
		$result = verbosify( $c[$want], $u->toNFD( $c[$col] ), $col, 'NFD', $verbose ) && $result;
	}
	return $result;
}
00218 
/**
 * Check toNFKC(): every one of the five columns must yield column 4.
 *
 * @param $u UtfNormal: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused here
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all five comparisons matched
 */
function testNFKC( &$u, $c, $comment, $verbose ) {
	$result = true;
	for( $col = 1; $col <= 5; $col++ ) {
		$result = verbosify( $c[4], $u->toNFKC( $c[$col] ), $col, 'NFKC', $verbose ) && $result;
	}
	return $result;
}
00227 
/**
 * Check toNFKD(): every one of the five columns must yield column 5.
 *
 * @param $u UtfNormal: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused here
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all five comparisons matched
 */
function testNFKD( &$u, $c, $comment, $verbose ) {
	$result = true;
	for( $col = 1; $col <= 5; $col++ ) {
		$result = verbosify( $c[5], $u->toNFKD( $c[$col] ), $col, 'NFKD', $verbose ) && $result;
	}
	return $result;
}
00236 
/**
 * Check that a character is left unchanged by all four normalization forms
 * and by cleanUp(). On failure in verbose mode, print the description and
 * run the checks again with per-comparison output enabled.
 *
 * @param $u UtfNormal: normalizer under test
 * @param $char String: the character, UTF-8 encoded
 * @param $desc String: label printed ahead of the diagnostics
 * @param $verbose Boolean: whether to re-run a failing character with diagnostics
 * @param $reportFailure Boolean: true on the diagnostic re-run; used as the
 *   verbose flag of each comparison
 * @return Boolean: true if every operation returned $char unchanged
 */
function testInvariant( &$u, $char, $desc, $verbose, $reportFailure = false ) {
	# Every operation is run unconditionally; none is skipped after a failure.
	$result = verbosify( $char, $u->toNFC( $char ), 1, 'NFC', $reportFailure );
	$result = verbosify( $char, $u->toNFD( $char ), 1, 'NFD', $reportFailure ) && $result;
	$result = verbosify( $char, $u->toNFKC( $char ), 1, 'NFKC', $reportFailure ) && $result;
	$result = verbosify( $char, $u->toNFKD( $char ), 1, 'NFKD', $reportFailure ) && $result;
	$result = verbosify( $char, $u->cleanUp( $char ), 1, 'cleanUp', $reportFailure ) && $result;

	if( $verbose && !$result && !$reportFailure ) {
		# End the heading line explicitly so the first diagnostic line
		# does not run onto it.
		print "$desc\n";
		testInvariant( $u, $char, $desc, $verbose, true );
	}
	return $result;
}