MediaWiki  REL1_23
CleanUpTest.php
Go to the documentation of this file.
00001 <?php
/**
 * Tests for UtfNormal::cleanUp().
 *
 * The cases below feed well-formed and malformed byte sequences to
 * UtfNormal::cleanUp() and compare the result byte-for-byte against the
 * expected output. Where an input is expected to be rejected, the expected
 * output contains UTF8_REPLACEMENT (spelled out as "\xef\xbf\xbd" in the
 * literal expectations).
 *
 * All comparisons use assertSame() so that two different strings which both
 * happen to look numeric (for example hex dumps such as "00" and "0000") can
 * never be treated as equal by a loose comparison.
 */
class CleanUpTest extends MediaWikiTestCase {
    /**
     * Plain printable ASCII must come back unchanged.
     */
    public function testAscii() {
        $text = 'This is plain ASCII text.';
        $this->assertSame( $text, UtfNormal::cleanUp( $text ) );
    }

    /**
     * A NUL byte must be replaced by the replacement character sequence.
     */
    public function testNull() {
        $text = "a \x00 null";
        $expect = "a \xef\xbf\xbd null";
        $this->assertSame(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /**
     * Text containing the two-byte sequence "\xc3\xa9" (precomposed e-acute)
     * must come back unchanged.
     */
    public function testLatin() {
        $text = "L'\xc3\xa9cole";
        $this->assertSame( $text, UtfNormal::cleanUp( $text ) );
    }

    /**
     * "e" followed by the combining sequence "\xcc\x81" must be turned into
     * the precomposed form "\xc3\xa9".
     */
    public function testLatinNormal() {
        $text = "L'e\xcc\x81cole";
        $expect = "L'\xc3\xa9cole";
        $this->assertSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Exhaustive check of every code point from 0 through UNICODE_MAX.
     *
     * The "X" prefix keeps PHPUnit from picking this method up as a test;
     * rename it to testAllChars to run it. It iterates over the whole code
     * point range, so it is slow.
     *
     * For each code point the expectation is:
     *  - tab, LF, CR, and everything above 0x1f that is neither a surrogate
     *    nor 0xfffe/0xffff: identical to UtfNormal::NFC() of the character
     *    when it has an entry in the canonical (de)composition tables,
     *    otherwise unchanged;
     *  - anything else: replaced by UTF8_REPLACEMENT.
     */
    public function XtestAllChars() {
        $rep = UTF8_REPLACEMENT;
        // Inclusive upper bound: the condition below explicitly accepts
        // $i <= UNICODE_MAX, so the last code point has to be visited too.
        for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
            $char = codepointToUtf8( $i );
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%04X", $i );

            // Progress indicator, printed once per 0x1000 code points.
            if ( $i % 0x1000 === 0 ) {
                echo "U+$x\n";
            }

            if ( $i === 0x0009 ||
                $i === 0x000a ||
                $i === 0x000d ||
                ( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
                ( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
                ( $i > 0xffff && $i <= UNICODE_MAX )
            ) {
                if ( isset( UtfNormal::$utfCanonicalComp[$char] )
                    || isset( UtfNormal::$utfCanonicalDecomp[$char] )
                ) {
                    $comp = UtfNormal::NFC( $char );
                    $this->assertSame(
                        bin2hex( $comp ),
                        bin2hex( $clean ),
                        "U+$x should be normalized to NFC" );
                } else {
                    $this->assertSame(
                        bin2hex( $char ),
                        bin2hex( $clean ),
                        "U+$x should be intact" );
                }
            } else {
                $this->assertSame( bin2hex( $rep ), bin2hex( $clean ), $x );
            }
        }
    }

    /**
     * Run the single-byte check with every combination of an empty or
     * one-character ASCII prefix and suffix.
     */
    public function testAllBytes() {
        $this->doTestBytes( '', '' );
        $this->doTestBytes( 'x', '' );
        $this->doTestBytes( '', 'x' );
        $this->doTestBytes( 'x', 'x' );
    }

    /**
     * Check every single byte value 0x00-0xff, wrapped in $head and $tail.
     *
     * Tab, LF, CR and 0x20-0x7f must be kept; every other byte must be
     * replaced by UTF8_REPLACEMENT. The first failing byte aborts the run
     * because the assertion throws.
     *
     * @param string $head Text placed before the byte under test
     * @param string $tail Text placed after the byte under test
     */
    public function doTestBytes( $head, $tail ) {
        for ( $i = 0x0; $i < 256; $i++ ) {
            $char = $head . chr( $i ) . $tail;
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%02X", $i );

            if ( $i === 0x0009 ||
                $i === 0x000a ||
                $i === 0x000d ||
                ( $i > 0x001f && $i < 0x80 )
            ) {
                $this->assertSame(
                    bin2hex( $char ),
                    bin2hex( $clean ),
                    "ASCII byte $x should be intact" );
            } else {
                $norm = $head . UTF8_REPLACEMENT . $tail;
                $this->assertSame(
                    bin2hex( $norm ),
                    bin2hex( $clean ),
                    "Forbidden byte $x should be rejected" );
            }
        }
    }

    /**
     * Run the two-byte check with every combination of an empty or
     * one-character ASCII prefix and suffix.
     */
    public function testDoubleBytes() {
        $this->doTestDoubleBytes( '', '' );
        $this->doTestDoubleBytes( 'x', '' );
        $this->doTestDoubleBytes( '', 'x' );
        $this->doTestDoubleBytes( 'x', 'x' );
    }

    /**
     * Check two-byte combinations, wrapped in $head and $tail.
     *
     * Both loops advance in steps of two, so only even byte values are
     * exercised: first byte 0xc0-0xfe, second byte 0x80-0xfe.
     *
     * @param string $head Text placed before the pair under test
     * @param string $tail Text placed after the pair under test
     */
    public function doTestDoubleBytes( $head, $tail ) {
        for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
            for ( $second = 0x80; $second < 0x100; $second += 2 ) {
                $char = $head . chr( $first ) . chr( $second ) . $tail;
                $clean = UtfNormal::cleanUp( $char );
                $x = sprintf( "%02X,%02X", $first, $second );

                if ( $first > 0xc1 &&
                    $first < 0xe0 &&
                    $second < 0xc0
                ) {
                    // Lead byte 0xc2-0xdf followed by a byte below 0xc0:
                    // expected to survive, apart from NFC normalization.
                    $norm = UtfNormal::NFC( $char );
                    $this->assertSame(
                        bin2hex( $norm ),
                        bin2hex( $clean ),
                        "Pair $x should be intact" );
                } elseif ( $first > 0xfd || $second > 0xbf ) {
                    # fe and ff are not legal head bytes -- expect two replacement chars
                    $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $this->assertSame(
                        bin2hex( $norm ),
                        bin2hex( $clean ),
                        "Forbidden pair $x should be rejected" );
                } else {
                    // Remaining pairs are expected to collapse into a single
                    // replacement character.
                    $norm = $head . UTF8_REPLACEMENT . $tail;
                    $this->assertSame(
                        bin2hex( $norm ),
                        bin2hex( $clean ),
                        "Forbidden pair $x should be rejected" );
                }
            }
        }
    }

    /**
     * Run the three-byte check with every combination of an empty or
     * one-character ASCII prefix and suffix.
     */
    public function testTripleBytes() {
        $this->doTestTripleBytes( '', '' );
        $this->doTestTripleBytes( 'x', '' );
        $this->doTestTripleBytes( '', 'x' );
        $this->doTestTripleBytes( 'x', 'x' );
    }

    /**
     * Check three-byte combinations, wrapped in $head and $tail.
     *
     * The first and second byte advance in steps of two (even values only:
     * 0xc0-0xfe and 0x80-0xfe respectively); the third byte is fixed at 0x80.
     *
     * NOTE(review): because $first is always even, the `$first == 0xed`
     * surrogate branch below can never be taken with the current stride.
     * Widening the stride to 1 would exercise it, but the remaining
     * expectations would have to be re-validated for odd lead bytes first.
     *
     * @param string $head Text placed before the triplet under test
     * @param string $tail Text placed after the triplet under test
     */
    public function doTestTripleBytes( $head, $tail ) {
        for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
            for ( $second = 0x80; $second < 0x100; $second += 2 ) {
                # Only 0x80 is tried as third byte; the full-range variant
                # ( $third = 0x80; $third < 0x100; $third++ ) is disabled.
                for ( $third = 0x80; $third < 0x81; $third++ ) {
                    $char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
                    $clean = UtfNormal::cleanUp( $char );
                    $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

                    if ( $first >= 0xe0 &&
                        $first < 0xf0 &&
                        $second < 0xc0 &&
                        $third < 0xc0
                    ) {
                        // Structurally a three-byte sequence: lead byte
                        // 0xe0-0xef followed by two bytes below 0xc0.
                        if ( $first == 0xe0 && $second < 0xa0 ) {
                            $this->assertSame(
                                bin2hex( $head . UTF8_REPLACEMENT . $tail ),
                                bin2hex( $clean ),
                                "Overlong triplet $x should be rejected" );
                        } elseif ( $first == 0xed &&
                            ( chr( $first ) . chr( $second ) . chr( $third ) ) >= UTF8_SURROGATE_FIRST
                        ) {
                            $this->assertSame(
                                bin2hex( $head . UTF8_REPLACEMENT . $tail ),
                                bin2hex( $clean ),
                                "Surrogate triplet $x should be rejected" );
                        } else {
                            $this->assertSame(
                                bin2hex( UtfNormal::NFC( $char ) ),
                                bin2hex( $clean ),
                                "Triplet $x should be intact" );
                        }
                    } elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
                        // First two bytes form a valid pair; the third byte
                        // is left over and expected to be replaced.
                        $this->assertSame(
                            bin2hex(
                                UtfNormal::NFC( $head . chr( $first ) . chr( $second ) )
                                . UTF8_REPLACEMENT . $tail
                            ),
                            bin2hex( $clean ),
                            "Valid 2-byte $x + broken tail" );
                    } elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
                        // First byte is expected to be replaced; the second
                        // and third byte form a valid pair.
                        $this->assertSame(
                            bin2hex(
                                $head . UTF8_REPLACEMENT
                                . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail )
                            ),
                            bin2hex( $clean ),
                            "Broken head + valid 2-byte $x" );
                    } elseif ( ( $first > 0xfd || $second > 0xfd ) &&
                        ( ( $second > 0xbf && $third > 0xbf ) ||
                            ( $second < 0xc0 && $third < 0xc0 ) ||
                            ( $second > 0xfd ) ||
                            ( $third > 0xfd ) )
                    ) {
                        # fe and ff are not legal head bytes -- expect three replacement chars
                        $this->assertSame(
                            bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
                            bin2hex( $clean ),
                            "Forbidden triplet $x should be rejected" );
                    } elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
                        $this->assertSame(
                            bin2hex( $head . UTF8_REPLACEMENT . $tail ),
                            bin2hex( $clean ),
                            "Forbidden triplet $x should be rejected" );
                    } else {
                        $this->assertSame(
                            bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
                            bin2hex( $clean ),
                            "Forbidden triplet $x should be rejected" );
                    }
                }
            }
        }
    }

    /**
     * Regression input for a chunking bug: valid and invalid bytes
     * interleaved, with one valid two-byte sequence ("\xdc\x96") that must
     * be preserved.
     */
    public function testChunkRegression() {
        # Check for regression against a chunking bug
        $text = "\x46\x55\xb8" .
            "\xdc\x96" .
            "\xee" .
            "\xe7" .
            "\x44" .
            "\xaa" .
            "\x2f\x25";
        $expect = "\x46\x55\xef\xbf\xbd" .
            "\xdc\x96" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x44" .
            "\xef\xbf\xbd" .
            "\x2f\x25";

        $this->assertSame(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /**
     * Regression input with stray tail and head bytes interposed between
     * ASCII characters; each stray byte must become one replacement
     * character and the trailing valid pair "\xd9\x95" must be preserved.
     */
    public function testInterposeRegression() {
        $text = "\x4e\x30" .
            "\xb1" . # bad tail
            "\x3a" .
            "\x92" . # bad tail
            "\x62\x3a" .
            "\x84" . # bad tail
            "\x43" .
            "\xc6" . # bad head
            "\x3f" .
            "\x92" . # bad tail
            "\xad" . # bad tail
            "\x7d" .
            "\xd9\x95";

        $expect = "\x4e\x30" .
            "\xef\xbf\xbd" .
            "\x3a" .
            "\xef\xbf\xbd" .
            "\x62\x3a" .
            "\xef\xbf\xbd" .
            "\x43" .
            "\xef\xbf\xbd" .
            "\x3f" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x7d" .
            "\xd9\x95";

        $this->assertSame(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /**
     * Regression input mixing forbidden ASCII control bytes, stray head and
     * tail bytes, and the overlong sequence "\xc1\xa6" (expected to yield a
     * single replacement character).
     */
    public function testOverlongRegression() {
        $text = "\x67" .
            "\x1a" . # forbidden ascii
            "\xea" . # bad head
            "\xc1\xa6" . # overlong sequence
            "\xad" . # bad tail
            "\x1c" . # forbidden ascii
            "\xb0" . # bad tail
            "\x3c" .
            "\x9e"; # bad tail
        $expect = "\x67" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x3c" .
            "\xef\xbf\xbd";
        $this->assertSame(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /**
     * Regression input: an encoded surrogate followed by three stray bytes;
     * the surrogate and each stray byte must become one replacement
     * character apiece.
     */
    public function testSurrogateRegression() {
        $text = "\xed\xb4\x96" . # surrogate 0xDD16
            "\x83" . # bad tail
            "\xb4" . # bad tail
            "\xac"; # bad head
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd";
        $this->assertSame(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /**
     * Regression input: U+FFFE followed by a stray tail byte, a stray head
     * byte and an ASCII character that must survive.
     */
    public function testBomRegression() {
        $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
            "\xb2" . # bad tail
            "\xef" . # bad head
            "\x59";
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x59";
        $this->assertSame(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /**
     * U+FFFF on its own must be replaced by the replacement character.
     */
    public function testForbiddenRegression() {
        $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
        $expect = "\xef\xbf\xbd";
        $this->assertSame(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /**
     * A Hangul character followed by a further final jamo must be left
     * exactly as it is.
     */
    public function testHangulRegression() {
        $text = "\xed\x9c\xaf" . # Hangul char
            "\xe1\x87\x81"; # followed by another final jamo
        $expect = $text; # Should *not* change.
        $this->assertSame(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }
}