MediaWiki  REL1_22
CleanUpTest.php
Go to the documentation of this file.
00001 <?php
/**
 * Tests for UtfNormal::cleanUp().
 *
 * The cases feed plain ASCII, composed and decomposed Latin text, every single
 * byte, sampled two- and three-byte sequences and a handful of hand-written
 * regression inputs to UtfNormal::cleanUp() and compare the result with the
 * expected byte string. Rejected input is expected to show up as
 * U+FFFD ("\xef\xbf\xbd", UTF8_REPLACEMENT) in the output.
 */
class CleanUpTest extends MediaWikiTestCase {
    /**
     * Assert that two byte strings are identical, reporting them as hex.
     *
     * @param string $expected Expected bytes
     * @param string $actual Bytes actually produced
     * @param string $message Optional description shown when the assertion fails
     */
    private function assertSameBytes( $expected, $actual, $message = '' ) {
        # Hex keeps the failure output printable. assertSame() is used because
        # hex strings can look numeric ("1e01" and "0010" are loosely equal in
        # PHP), and depending on the PHPUnit version assertEquals() may compare
        # such strings loosely.
        $this->assertSame( bin2hex( $expected ), bin2hex( $actual ), $message );
    }

    /**
     * Plain ASCII text must come back unchanged.
     */
    public function testAscii() {
        $text = 'This is plain ASCII text.';
        $this->assertSame( $text, UtfNormal::cleanUp( $text ) );
    }

    /**
     * A NUL byte must be replaced by U+FFFD.
     */
    public function testNull() {
        $text = "a \x00 null";
        $expect = "a \xef\xbf\xbd null";
        $this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Text containing a precomposed U+00E9 must come back unchanged.
     */
    public function testLatin() {
        $text = "L'\xc3\xa9cole";
        $this->assertSame( $text, UtfNormal::cleanUp( $text ) );
    }

    /**
     * "e" followed by U+0301 must be composed into U+00E9.
     */
    public function testLatinNormal() {
        $text = "L'e\xcc\x81cole";
        $expect = "L'\xc3\xa9cole";
        $this->assertSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Walk every code point from 0 to UNICODE_MAX and check how cleanUp()
     * treats its UTF-8 encoding.
     *
     * The "X" prefix keeps PHPUnit from picking this method up as a test;
     * NOTE(review): presumably disabled because of its run time -- confirm.
     */
    public function XtestAllChars() {
        $rep = UTF8_REPLACEMENT;
        # The bound is inclusive so that UNICODE_MAX itself, which the validity
        # check below explicitly allows, is exercised as well.
        for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
            $char = codepointToUtf8( $i );
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%04X", $i );

            # Progress marker every 0x1000 code points.
            if ( $i % 0x1000 === 0 ) {
                echo "U+$x\n";
            }

            # Tab, LF, CR and everything above U+001F are expected to survive,
            # except the surrogate range, U+FFFE and U+FFFF.
            $isValid = $i === 0x0009 ||
                $i === 0x000a ||
                $i === 0x000d ||
                ( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
                ( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
                ( $i > 0xffff && $i <= UNICODE_MAX );

            if ( !$isValid ) {
                $this->assertSameBytes( $rep, $clean, $x );
                continue;
            }

            if ( isset( UtfNormal::$utfCanonicalComp[$char] ) ||
                isset( UtfNormal::$utfCanonicalDecomp[$char] )
            ) {
                # Characters listed in the normalization tables must match
                # whatever UtfNormal::NFC() makes of them.
                $this->assertSameBytes(
                    UtfNormal::NFC( $char ),
                    $clean,
                    "U+$x should be decomposed" );
            } else {
                $this->assertSameBytes(
                    $char,
                    $clean,
                    "U+$x should be intact" );
            }
        }
    }

    /**
     * Run the single-byte check bare, with a leading "x", with a trailing "x"
     * and with both.
     */
    public function testAllBytes() {
        $this->doTestBytes( '', '' );
        $this->doTestBytes( 'x', '' );
        $this->doTestBytes( '', 'x' );
        $this->doTestBytes( 'x', 'x' );
    }

    /**
     * Check every byte value 0x00-0xFF on its own, wrapped in $head and $tail.
     *
     * Tab, LF, CR and 0x20-0x7F must pass through unchanged; every other byte
     * must be replaced by UTF8_REPLACEMENT. A failed assertion aborts the
     * test, so the loop needs no explicit bail-out.
     *
     * @param string $head Text placed before the byte under test
     * @param string $tail Text placed after the byte under test
     */
    public function doTestBytes( $head, $tail ) {
        for ( $i = 0x0; $i < 256; $i++ ) {
            $char = $head . chr( $i ) . $tail;
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%02X", $i );

            $isAllowed = $i === 0x0009 ||
                $i === 0x000a ||
                $i === 0x000d ||
                ( $i > 0x001f && $i < 0x80 );

            if ( $isAllowed ) {
                $this->assertSameBytes(
                    $char,
                    $clean,
                    "ASCII byte $x should be intact" );
            } else {
                $this->assertSameBytes(
                    $head . UTF8_REPLACEMENT . $tail,
                    $clean,
                    "Forbidden byte $x should be rejected" );
            }
        }
    }

    /**
     * Run the two-byte check bare, with a leading "x", with a trailing "x"
     * and with both.
     */
    public function testDoubleBytes() {
        $this->doTestDoubleBytes( '', '' );
        $this->doTestDoubleBytes( 'x', '' );
        $this->doTestDoubleBytes( '', 'x' );
        $this->doTestDoubleBytes( 'x', 'x' );
    }

    /**
     * Check two-byte sequences wrapped in $head and $tail.
     *
     * Both loops advance in steps of two, so only every second first byte
     * (0xC0, 0xC2, ...) and every second second byte (0x80, 0x82, ...) is
     * sampled.
     *
     * @param string $head Text placed before the pair under test
     * @param string $tail Text placed after the pair under test
     */
    public function doTestDoubleBytes( $head, $tail ) {
        for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
            for ( $second = 0x80; $second < 0x100; $second += 2 ) {
                $char = $head . chr( $first ) . chr( $second ) . $tail;
                $clean = UtfNormal::cleanUp( $char );
                $x = sprintf( "%02X,%02X", $first, $second );

                if ( $first > 0xc1 &&
                    $first < 0xe0 &&
                    $second < 0xc0
                ) {
                    # Two-byte head (0xC2-0xDF) followed by a tail byte.
                    $norm = UtfNormal::NFC( $char );
                    $message = "Pair $x should be intact";
                } elseif ( $first > 0xfd || $second > 0xbf ) {
                    # fe and ff are not legal head bytes -- expect two replacement chars
                    # (the same goes for a second byte that is not a tail byte)
                    $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden pair $x should be rejected";
                } else {
                    # Everything else is expected to collapse into a single
                    # replacement character.
                    $norm = $head . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden pair $x should be rejected";
                }

                $this->assertSameBytes( $norm, $clean, $message );
            }
        }
    }

    /**
     * Run the three-byte check bare, with a leading "x", with a trailing "x"
     * and with both.
     */
    public function testTripleBytes() {
        $this->doTestTripleBytes( '', '' );
        $this->doTestTripleBytes( 'x', '' );
        $this->doTestTripleBytes( '', 'x' );
        $this->doTestTripleBytes( 'x', 'x' );
    }

    /**
     * Check three-byte sequences wrapped in $head and $tail.
     *
     * The first and second byte are sampled in steps of two, and the third
     * byte is pinned to 0x80.
     *
     * @param string $head Text placed before the triplet under test
     * @param string $tail Text placed after the triplet under test
     */
    public function doTestTripleBytes( $head, $tail ) {
        for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
            for ( $second = 0x80; $second < 0x100; $second += 2 ) {
                # Only 0x80 is used as third byte; widen the bound to 0x100 to
                # cover every third byte.
                # NOTE(review): presumably restricted to keep run time down -- confirm.
                for ( $third = 0x80; $third < 0x81; $third++ ) {
                    $triplet = chr( $first ) . chr( $second ) . chr( $third );
                    $char = $head . $triplet . $tail;
                    $clean = UtfNormal::cleanUp( $char );
                    $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

                    if ( $first >= 0xe0 &&
                        $first < 0xf0 &&
                        $second < 0xc0 &&
                        $third < 0xc0
                    ) {
                        # Three-byte head followed by two tail bytes.
                        if ( $first === 0xe0 && $second < 0xa0 ) {
                            $expect = $head . UTF8_REPLACEMENT . $tail;
                            $message = "Overlong triplet $x should be rejected";
                        } elseif ( $first === 0xed &&
                            strcmp( $triplet, UTF8_SURROGATE_FIRST ) >= 0
                        ) {
                            # NOTE: with the even-only sampling of $first above,
                            # 0xed is never generated, so this branch is
                            # currently not reached.
                            $expect = $head . UTF8_REPLACEMENT . $tail;
                            $message = "Surrogate triplet $x should be rejected";
                        } else {
                            $expect = UtfNormal::NFC( $char );
                            $message = "Triplet $x should be intact";
                        }
                    } elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
                        # First two bytes form a two-byte sequence; the third
                        # byte is a stray tail.
                        $expect = UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) .
                            UTF8_REPLACEMENT . $tail;
                        $message = "Valid 2-byte $x + broken tail";
                    } elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
                        # First byte is a stray head; the last two bytes form a
                        # two-byte sequence.
                        $expect = $head . UTF8_REPLACEMENT .
                            UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail );
                        $message = "Broken head + valid 2-byte $x";
                    } elseif ( ( $first > 0xfd || $second > 0xfd ) &&
                        ( ( $second > 0xbf && $third > 0xbf ) ||
                            ( $second < 0xc0 && $third < 0xc0 ) ||
                            ( $second > 0xfd ) ||
                            ( $third > 0xfd ) )
                    ) {
                        # fe and ff are not legal head bytes -- expect three replacement chars
                        $expect = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT .
                            UTF8_REPLACEMENT . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    } elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
                        # Remaining head followed by two tail bytes: one
                        # replacement character for the whole triplet.
                        $expect = $head . UTF8_REPLACEMENT . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    } else {
                        $expect = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    }

                    $this->assertSameBytes( $expect, $clean, $message );
                }
            }
        }
    }

    /**
     * Regression input for a chunking bug: valid sequences mixed with stray
     * head and tail bytes.
     */
    public function testChunkRegression() {
        # Check for regression against a chunking bug
        $text = "\x46\x55\xb8" .
            "\xdc\x96" .
            "\xee" .
            "\xe7" .
            "\x44" .
            "\xaa" .
            "\x2f\x25";
        $expect = "\x46\x55\xef\xbf\xbd" .
            "\xdc\x96" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x44" .
            "\xef\xbf\xbd" .
            "\x2f\x25";

        $this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Regression input with bad bytes interposed between ASCII characters;
     * each bad byte must yield exactly one replacement character.
     */
    public function testInterposeRegression() {
        $text = "\x4e\x30" .
            "\xb1" . # bad tail
            "\x3a" .
            "\x92" . # bad tail
            "\x62\x3a" .
            "\x84" . # bad tail
            "\x43" .
            "\xc6" . # bad head
            "\x3f" .
            "\x92" . # bad tail
            "\xad" . # bad tail
            "\x7d" .
            "\xd9\x95";

        $expect = "\x4e\x30" .
            "\xef\xbf\xbd" .
            "\x3a" .
            "\xef\xbf\xbd" .
            "\x62\x3a" .
            "\xef\xbf\xbd" .
            "\x43" .
            "\xef\xbf\xbd" .
            "\x3f" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x7d" .
            "\xd9\x95";

        $this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Regression input containing an overlong sequence surrounded by
     * forbidden ASCII control bytes and stray head/tail bytes.
     */
    public function testOverlongRegression() {
        $text = "\x67" .
            "\x1a" . # forbidden ascii
            "\xea" . # bad head
            "\xc1\xa6" . # overlong sequence
            "\xad" . # bad tail
            "\x1c" . # forbidden ascii
            "\xb0" . # bad tail
            "\x3c" .
            "\x9e"; # bad tail
        $expect = "\x67" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x3c" .
            "\xef\xbf\xbd";

        $this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Regression input: an encoded surrogate followed by stray bytes.
     */
    public function testSurrogateRegression() {
        $text = "\xed\xb4\x96" . # surrogate 0xDD16
            "\x83" . # bad tail
            "\xb4" . # bad tail
            "\xac"; # bad head
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd";

        $this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Regression input: U+FFFE followed by a stray tail, a stray head and an
     * ASCII letter.
     */
    public function testBomRegression() {
        $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
            "\xb2" . # bad tail
            "\xef" . # bad head
            "\x59";
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x59";

        $this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Regression input: U+FFFF must be replaced by U+FFFD.
     */
    public function testForbiddenRegression() {
        $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
        $expect = "\xef\xbf\xbd";

        $this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Regression input: a Hangul character followed by a further final jamo
     * must not be altered.
     */
    public function testHangulRegression() {
        $text = "\xed\x9c\xaf" . # Hangul char
            "\xe1\x87\x81"; # followed by another final jamo
        $expect = $text; # Should *not* change.

        $this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
    }
}