MediaWiki  REL1_19
UtfNormalTest.php
Go to the documentation of this file.
00001 <?php
# When true, every failing case is re-run with per-comparison output.
$verbose = true;
# Uncomment to dump raw bytes instead of codepoints in that output.
#define( 'PRETTY_UTF8', true );

# Note: the PCRE "e" (eval) modifier this code used to rely on was deprecated
# in PHP 5.5 and removed in PHP 7.0, where preg_replace() rejects the pattern
# and returns NULL. Its escaping of quotes in "$1" also corrupted quote
# characters. Both variants below therefore avoid regex evaluation entirely.
if( defined( 'PRETTY_UTF8' ) ) {
        /**
         * Render a string as uppercase hexadecimal, two digits per byte.
         *
         * @param $string String: raw bytes to display
         * @return String
         */
        function pretty( $string ) {
                return strtoupper( bin2hex( $string ) );
        }
} else {
        /**
         * Render a string as space-separated hexadecimal codepoints, each
         * zero-padded to at least four digits.
         *
         * @param $string String: UTF-8 text to display
         * @return String: empty if $string is empty or is not valid UTF-8
         */
        function pretty( $string ) {
                # The "u" modifier makes the empty pattern split between UTF-8
                # characters; preg_split() returns false on malformed input.
                $chars = preg_split( '//u', $string, -1, PREG_SPLIT_NO_EMPTY );
                if( $chars === false ) {
                        return '';
                }
                $codes = array();
                foreach( $chars as $char ) {
                        $codes[] = sprintf( "%04X", utf8ToCodepoint( $char ) );
                }
                return implode( ' ', $codes );
        }
}
00047 
# Passing --icu on the command line loads the php_utfnormal extension
# before the normalizer sources are pulled in.
if( isset( $_SERVER['argv'] ) ) {
        if( in_array( '--icu', $_SERVER['argv'] ) ) {
                dl( 'php_utfnormal.so' );
        }
}

require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';

# This script prints plain-text progress and is meant for the CLI only.
if( 'cli' != php_sapi_name() ) {
        die( "Run me from the command line please.\n" );
}
00059 
# Pass 1: run every case listed in the Unicode NormalizationTest.txt data file.
$in = fopen("NormalizationTest.txt", "rt");
if( !$in ) {
        print "Couldn't open NormalizationTest.txt -- can't run tests.\n";
        print "If necessary, manually download this file. It can be obtained at\n";
        print "http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt\n";
        exit(-1);
}

$normalizer = new UtfNormal;

$total = 0;
$success = 0;
$failure = 0;
$ok = true;
# Source strings (column 1) seen here; the invariant pass skips them.
$testedChars = array();
while( false !== ( $line = fgets( $in ) ) ) {
        # Separate the data from the trailing comment. Splitting into at most
        # two pieces keeps any later '#' inside the comment, and padding keeps
        # $comment defined for lines that contain no '#' at all.
        list( $data, $comment ) = array_pad( explode( '#', $line, 2 ), 2, '' );
        # Skip comment-only, blank and whitespace-only lines.
        if( trim( $data ) === '' ) continue;
        $matches = array();
        if( preg_match( '/@Part([\d])/', $data, $matches ) ) {
                # A new part begins: report on the part that just finished
                # (there is nothing to report before part 0).
                if( $matches[1] > 0 ) {
                        $ok = reportResults( $total, $success, $failure ) && $ok;
                }
                print "Part {$matches[1]}: $comment";
                continue;
        }

        $columns = array_map( "hexSequenceToUtf8", explode( ";", $data ) );
        # Shift everything up by one so indexes match the c1..c5 column names.
        array_unshift( $columns, '' );

        $testedChars[$columns[1]] = true;
        $total++;
        if( testNormals( $normalizer, $columns, $comment, $verbose ) ) {
                $success++;
        } else {
                $failure++;
                # print "FAILED: $comment";
        }
        # Progress indicator.
        if( $total % 100 == 0 ) print "$total ";
}
fclose( $in );

$ok = reportResults( $total, $success, $failure ) && $ok;
00103 
# Pass 2: every character in UnicodeData.txt that was not used as a source
# string in pass 1 is expected to come through each normalization unchanged.
$in = fopen("UnicodeData.txt", "rt" );
if( !$in ) {
        print "Can't open UnicodeData.txt for reading.\n";
        print "If necessary, fetch this file from the internet:\n";
        print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
        exit(-1);
}
print "Now testing invariants...\n";
while( false !== ($line = fgets( $in ) ) ) {
        # Field 0 is the hexadecimal codepoint, field 1 its name.
        $fields = explode( ';', $line );
        $char = codepointToUtf8( hexdec( $fields[0] ) );
        $desc = $fields[0] . ": " . $fields[1];

        # Can't check NULL with the ICU plugin, as null bytes fail in C land.
        # Skip other control characters, as we strip them for XML safety.
        if( $char < "\x20" ) {
                continue;
        }
        # Surrogates are illegal on their own or in UTF-8, ignore.
        if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
                continue;
        }
        # Already exercised by the conformance file.
        if( !empty( $testedChars[$char] ) ) {
                continue;
        }

        $total++;
        if( testInvariant( $normalizer, $char, $desc, $verbose ) ) {
                $success++;
        } else {
                $failure++;
        }
        if( $total % 100 == 0 ) print "$total ";
}
fclose( $in );

$ok = reportResults( $total, $success, $failure ) && $ok;

# The exit status mirrors the overall verdict.
print $ok ? "TEST SUCCEEDED!\n" : "TEST FAILED!\n";
exit( $ok ? 0 : -1 );
00143 
00144 ## ------
00145 
/**
 * Print a summary of the current batch of tests and reset the counters.
 *
 * @param $total Integer: number of tests run in this batch; reset to 0
 * @param $success Integer: number of passing tests; reset to 0
 * @param $failure Integer: number of failing tests; reset to 0
 * @return Boolean: true only if at least one test passed and none failed
 */
function reportResults( &$total, &$success, &$failure ) {
        # An empty batch must not divide by zero: that is a warning on older
        # PHP versions and a DivisionByZeroError on PHP 8.
        if( $total > 0 ) {
                $percSucc = intval( $success * 100 / $total );
                $percFail = intval( $failure * 100 / $total );
        } else {
                $percSucc = 0;
                $percFail = 0;
        }
        print "\n";
        print "$success tests successful ($percSucc%)\n";
        print "$failure tests failed ($percFail%)\n\n";
        $ok = ($success > 0 && $failure == 0);
        # Start the next batch from a clean slate.
        $total = 0;
        $success = 0;
        $failure = 0;
        return $ok;
}
00158 
/**
 * Run all five per-form checks against one conformance case.
 *
 * Every check is always executed, even after an earlier one has failed.
 * When $verbose is set and the case fails, the comment is printed and the
 * whole case is re-run once with per-comparison output switched on.
 *
 * @param $u Object: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: description of the case
 * @param $verbose Boolean: whether to print details for failing cases
 * @param $reportFailure Boolean: print every comparison (used for the re-run)
 * @return Boolean: true if every check passed
 */
function testNormals( &$u, $c, $comment, $verbose, $reportFailure = false ) {
        $nfc = testNFC( $u, $c, $comment, $reportFailure );
        $nfd = testNFD( $u, $c, $comment, $reportFailure );
        $nfkc = testNFKC( $u, $c, $comment, $reportFailure );
        $nfkd = testNFKD( $u, $c, $comment, $reportFailure );
        $cleanUp = testCleanUp( $u, $c, $comment, $reportFailure );
        $result = $nfc && $nfd && $nfkc && $nfkd && $cleanUp;

        # Nothing more to do when the case passed, when we are quiet, or when
        # this call already is the detailed re-run.
        if( $result || !$verbose || $reportFailure ) {
                return $result;
        }

        print $comment;
        testNormals( $u, $c, $comment, $verbose, true );
        return $result;
}
00172 
/**
 * Compare an expected string with an actual one, optionally printing the
 * outcome.
 *
 * @param $a String: expected value
 * @param $b String: actual value
 * @param $col Integer: column number, used only in the printed line
 * @param $form String: name of the form being checked, used only in the printed line
 * @param $verbose Boolean: whether to print the comparison
 * @return Boolean: true if strcmp() reports the two strings as equal
 */
function verbosify( $a, $b, $col, $form, $verbose ) {
        $result = ( 0 == strcmp( $a, $b ) );
        if( !$verbose ) {
                return $result;
        }

        $aa = pretty( $a );
        $bb = pretty( $b );
        if( $result ) {
                $ok = "succeed";
                $eq = "==";
        } else {
                $ok = " failed";
                $eq = "!=";
        }
        print "  $ok $form c$col '$aa' $eq '$bb'\n";
        return $result;
}
00185 
/**
 * Check $u->toNFC() against one conformance case.
 *
 * Columns 1-3 must each normalize to column 2, and columns 4-5 to column 4.
 * All five comparisons are always performed.
 *
 * @param $u Object: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all comparisons matched
 */
function testNFC( &$u, $c, $comment, $verbose ) {
        # input column => column holding the expected result
        $expectedColumn = array( 1 => 2, 2 => 2, 3 => 2, 4 => 4, 5 => 4 );
        $result = true;
        foreach( $expectedColumn as $col => $want ) {
                if( !verbosify( $c[$want], $u->toNFC( $c[$col] ), $col, 'NFC', $verbose ) ) {
                        $result = false;
                }
        }
        return $result;
}
00194 
/**
 * Check $u->cleanUp() against one conformance case.
 *
 * The expectations are the same as for NFC: columns 1-3 must yield column 2,
 * and columns 4-5 must yield column 4. All five comparisons are always
 * performed.
 *
 * @param $u Object: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all comparisons matched
 */
function testCleanUp( &$u, $c, $comment, $verbose ) {
        # input column => column holding the expected result
        $expectedColumn = array( 1 => 2, 2 => 2, 3 => 2, 4 => 4, 5 => 4 );
        $result = true;
        foreach( $expectedColumn as $col => $want ) {
                # cleanUp() is handed a scratch copy rather than the column itself.
                # NOTE(review): presumably because it may modify its argument -- confirm against UtfNormal.
                $x = $c[$col];
                if( !verbosify( $c[$want], $u->cleanUp( $x ), $col, 'cleanUp', $verbose ) ) {
                        $result = false;
                }
        }
        return $result;
}
00208 
/**
 * Check $u->toNFD() against one conformance case.
 *
 * Columns 1-3 must each normalize to column 3, and columns 4-5 to column 5.
 * All five comparisons are always performed.
 *
 * @param $u Object: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all comparisons matched
 */
function testNFD( &$u, $c, $comment, $verbose ) {
        $result = true;
        for( $col = 1; $col <= 5; $col++ ) {
                # The first three columns share one expected form, the last two another.
                $want = ( $col <= 3 ) ? 3 : 5;
                if( !verbosify( $c[$want], $u->toNFD( $c[$col] ), $col, 'NFD', $verbose ) ) {
                        $result = false;
                }
        }
        return $result;
}
00217 
/**
 * Check $u->toNFKC() against one conformance case.
 *
 * Every one of the five columns must normalize to column 4. All five
 * comparisons are always performed.
 *
 * @param $u Object: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all comparisons matched
 */
function testNFKC( &$u, $c, $comment, $verbose ) {
        $result = true;
        for( $col = 1; $col <= 5; $col++ ) {
                if( !verbosify( $c[4], $u->toNFKC( $c[$col] ), $col, 'NFKC', $verbose ) ) {
                        $result = false;
                }
        }
        return $result;
}
00226 
/**
 * Check $u->toNFKD() against one conformance case.
 *
 * Every one of the five columns must normalize to column 5. All five
 * comparisons are always performed.
 *
 * @param $u Object: normalizer under test
 * @param $c Array: test columns, indexed 1..5
 * @param $comment String: unused
 * @param $verbose Boolean: print each comparison
 * @return Boolean: true if all comparisons matched
 */
function testNFKD( &$u, $c, $comment, $verbose ) {
        $result = true;
        for( $col = 1; $col <= 5; $col++ ) {
                if( !verbosify( $c[5], $u->toNFKD( $c[$col] ), $col, 'NFKD', $verbose ) ) {
                        $result = false;
                }
        }
        return $result;
}
00235 
/**
 * Check that a character comes back unchanged from toNFC(), toNFD(),
 * toNFKC(), toNFKD() and cleanUp().
 *
 * Every check is always executed. When $verbose is set and any check fails,
 * the description is printed and the checks are re-run once with
 * per-comparison output switched on.
 *
 * @param $u Object: normalizer under test
 * @param $char String: the character to check
 * @param $desc String: description printed for a failing character
 * @param $verbose Boolean: whether to print details for failing characters
 * @param $reportFailure Boolean: print every comparison (used for the re-run)
 * @return Boolean: true if all five results equal $char
 */
function testInvariant( &$u, $char, $desc, $verbose, $reportFailure = false ) {
        $nfc = verbosify( $char, $u->toNFC( $char ), 1, 'NFC', $reportFailure );
        $nfd = verbosify( $char, $u->toNFD( $char ), 1, 'NFD', $reportFailure );
        $nfkc = verbosify( $char, $u->toNFKC( $char ), 1, 'NFKC', $reportFailure );
        $nfkd = verbosify( $char, $u->toNFKD( $char ), 1, 'NFKD', $reportFailure );
        $cleanUp = verbosify( $char, $u->cleanUp( $char ), 1, 'cleanUp', $reportFailure );
        $result = $nfc && $nfd && $nfkc && $nfkd && $cleanUp;

        # Nothing more to do when the character passed, when we are quiet, or
        # when this call already is the detailed re-run.
        if( $result || !$verbose || $reportFailure ) {
                return $result;
        }

        print $desc;
        testInvariant( $u, $char, $desc, $verbose, true );
        return $result;
}