php/html/CleanUpTest_8php_source.html

00001 <?php
/**
 * Tests for UtfNormal::cleanUp().
 *
 * The cases below check that well-formed UTF-8 text is returned in composed
 * (NFC) form and that every ill-formed or forbidden byte sequence is replaced
 * by the replacement character (the byte sequence EF BF BD, i.e. U+FFFD).
 *
 * Most assertions compare bin2hex() output instead of the raw strings so
 * that a failure message shows readable hex rather than broken UTF-8.
 */
class CleanUpTest extends MediaWikiTestCase {
    /**
     * Assert that UtfNormal::cleanUp() turns $text into exactly $expect.
     *
     * @param string $expect Expected bytes after clean-up
     * @param string $text Raw input bytes
     */
    private function assertCleanUpHex( $expect, $text ) {
        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) )
        );
    }

    /**
     * Plain printable ASCII must pass through unchanged.
     */
    public function testAscii() {
        $text = 'This is plain ASCII text.';
        $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
    }

    /**
     * A NUL byte is forbidden and must become the replacement character.
     */
    public function testNull() {
        $text = "a \x00 null";
        $expect = "a \xef\xbf\xbd null";
        $this->assertCleanUpHex( $expect, $text );
    }

    /**
     * An already precomposed Latin letter (C3 A9, e-acute) stays as it is.
     */
    public function testLatin() {
        $text = "L'\xc3\xa9cole";
        $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
    }

    /**
     * "e" followed by a combining acute accent (CC 81) is composed into the
     * single precomposed letter (C3 A9).
     */
    public function testLatinNormal() {
        $text = "L'e\xcc\x81cole";
        $expect = "L'\xc3\xa9cole";
        $this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Walk over every code point below UNICODE_MAX and check how cleanUp()
     * treats its UTF-8 encoding.
     *
     * The method name does not start with "test", so PHPUnit does not pick it
     * up automatically; rename it to run it (it iterates over more than a
     * million code points).
     */
    public function XtestAllChars() {
        $rep = UTF8_REPLACEMENT;
        // NOTE(review): the loop stops before UNICODE_MAX although the
        // validity condition below includes it -- confirm whether the last
        // code point is meant to be covered.
        for ( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
            $char = codepointToUtf8( $i );
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%04X", $i );

            // Progress marker every 0x1000 code points.
            if ( $i % 0x1000 === 0 ) {
                echo "U+$x\n";
            }

            // Tab, LF, CR, everything from U+0020 up to the surrogates,
            // everything after the surrogates except U+FFFE/U+FFFF, and the
            // supplementary planes are expected to survive clean-up.
            $allowed = ( $i === 0x0009 ||
                $i === 0x000a ||
                $i === 0x000d ||
                ( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
                ( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
                ( $i > 0xffff && $i <= UNICODE_MAX )
            );

            if ( !$allowed ) {
                $this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
                continue;
            }

            if ( isset( UtfNormal::$utfCanonicalComp[$char] )
                || isset( UtfNormal::$utfCanonicalDecomp[$char] )
            ) {
                // Characters listed in the normalization tables may be
                // changed by normalization; the NFC form is the reference.
                $comp = UtfNormal::NFC( $char );
                $this->assertEquals(
                    bin2hex( $comp ),
                    bin2hex( $clean ),
                    "U+$x should be decomposed" );
            } else {
                $this->assertEquals(
                    bin2hex( $char ),
                    bin2hex( $clean ),
                    "U+$x should be intact" );
            }
        }
    }

    /**
     * Data provider for the byte-level tests: every combination of an empty
     * or one-character ASCII prefix and suffix around the bytes under test.
     *
     * @return array List of array( $head, $tail ) argument pairs
     */
    public static function provideAllBytes() {
        return array(
            array( '', '' ),
            array( 'x', '' ),
            array( '', 'x' ),
            array( 'x', 'x' ),
        );
    }

    /**
     * Check every single byte value 0x00..0xff, optionally wrapped in ASCII.
     *
     * Tab, LF, CR and 0x20..0x7f must be kept; every other byte must be
     * replaced by UTF8_REPLACEMENT.
     *
     * @dataProvider provideAllBytes
     * @param string $head Text placed before the byte under test
     * @param string $tail Text placed after the byte under test
     */
    public function testBytes( $head, $tail ) {
        for ( $i = 0x0; $i < 256; $i++ ) {
            $char = $head . chr( $i ) . $tail;
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%02X", $i );

            if ( $i === 0x0009 ||
                $i === 0x000a ||
                $i === 0x000d ||
                ( $i > 0x001f && $i < 0x80 )
            ) {
                $this->assertEquals(
                    bin2hex( $char ),
                    bin2hex( $clean ),
                    "ASCII byte $x should be intact" );
            } else {
                $norm = $head . UTF8_REPLACEMENT . $tail;
                $this->assertEquals(
                    bin2hex( $norm ),
                    bin2hex( $clean ),
                    "Forbidden byte $x should be rejected" );
            }
        }
    }

    /**
     * Check two-byte combinations of a lead byte (0xc0..0xfe) and a second
     * byte (0x80..0xfe), optionally wrapped in ASCII.
     *
     * Both loops advance in steps of two, so only even byte values are
     * sampled.
     *
     * @dataProvider provideAllBytes
     * @param string $head Text placed before the bytes under test
     * @param string $tail Text placed after the bytes under test
     */
    public function testDoubleBytes( $head, $tail ) {
        for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
            for ( $second = 0x80; $second < 0x100; $second += 2 ) {
                $char = $head . chr( $first ) . chr( $second ) . $tail;
                $clean = UtfNormal::cleanUp( $char );
                $x = sprintf( "%02X,%02X", $first, $second );

                if ( $first > 0xc1 &&
                    $first < 0xe0 &&
                    $second < 0xc0
                ) {
                    // Lead byte c2..df followed by a continuation byte: a
                    // well-formed pair, compared against its NFC form.
                    $norm = UtfNormal::NFC( $char );
                    $message = "Pair $x should be intact";
                } elseif ( $first > 0xfd || $second > 0xbf ) {
                    // fe and ff are not legal head bytes, and a second byte
                    // above bf is not a continuation byte -- expect two
                    // replacement chars
                    $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden pair $x should be rejected";
                } else {
                    // Remaining lead bytes (c0, or e0..fc) followed by one
                    // continuation byte: a single replacement char.
                    $norm = $head . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden pair $x should be rejected";
                }

                $this->assertEquals( bin2hex( $norm ), bin2hex( $clean ), $message );
            }
        }
    }

    /**
     * Check three-byte combinations, optionally wrapped in ASCII.
     *
     * The first two bytes are sampled at even values only (step of two) and
     * the third byte is fixed to 0x80; see the inner loop.
     *
     * @dataProvider provideAllBytes
     * @param string $head Text placed before the bytes under test
     * @param string $tail Text placed after the bytes under test
     */
    public function testTripleBytes( $head, $tail ) {
        for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
            for ( $second = 0x80; $second < 0x100; $second += 2 ) {
                // Only the third byte 0x80 is exercised. Raise the upper
                // bound to 0x100 to cover every third byte (presumably
                // restricted to keep the run time down -- confirm).
                for ( $third = 0x80; $third < 0x81; $third++ ) {
                    $triplet = chr( $first ) . chr( $second ) . chr( $third );
                    $char = $head . $triplet . $tail;
                    $clean = UtfNormal::cleanUp( $char );
                    $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

                    if ( $first >= 0xe0 &&
                        $first < 0xf0 &&
                        $second < 0xc0 &&
                        $third < 0xc0
                    ) {
                        // Structurally a three-byte sequence.
                        if ( $first === 0xe0 && $second < 0xa0 ) {
                            $expect = $head . UTF8_REPLACEMENT . $tail;
                            $message = "Overlong triplet $x should be rejected";
                        } elseif ( $first === 0xed &&
                            $triplet >= UTF8_SURROGATE_FIRST
                        ) {
                            // NOTE(review): with the even-only sampling of
                            // $first, 0xed is never reached, so this branch
                            // is currently not exercised.
                            $expect = $head . UTF8_REPLACEMENT . $tail;
                            $message = "Surrogate triplet $x should be rejected";
                        } else {
                            $expect = UtfNormal::NFC( $char );
                            $message = "Triplet $x should be intact";
                        }
                    } elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
                        // Well-formed two-byte sequence, then a stray byte.
                        $expect = UtfNormal::NFC( $head . chr( $first ) .
                                chr( $second ) ) . UTF8_REPLACEMENT . $tail;
                        $message = "Valid 2-byte $x + broken tail";
                    } elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
                        // Stray lead byte, then a well-formed two-byte sequence.
                        $expect = $head . UTF8_REPLACEMENT .
                            UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail );
                        $message = "Broken head + valid 2-byte $x";
                    } elseif ( ( $first > 0xfd || $second > 0xfd ) &&
                        ( ( $second > 0xbf && $third > 0xbf ) ||
                            ( $second < 0xc0 && $third < 0xc0 ) ||
                            ( $second > 0xfd ) ||
                            ( $third > 0xfd ) )
                    ) {
                        # fe and ff are not legal head bytes -- expect three replacement chars
                        $expect = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT .
                            UTF8_REPLACEMENT . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    } elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
                        // Lead byte followed by two continuation bytes that
                        // do not form a valid sequence: one replacement char.
                        $expect = $head . UTF8_REPLACEMENT . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    } else {
                        $expect = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    }

                    $this->assertEquals( bin2hex( $expect ), bin2hex( $clean ), $message );
                }
            }
        }
    }

    /**
     * Check for regression against a chunking bug.
     */
    public function testChunkRegression() {
        $text = "\x46\x55\xb8" . # two ASCII letters, then a stray tail byte
            "\xdc\x96" . # valid two-byte sequence
            "\xee" . # head byte without tail
            "\xe7" . # head byte without tail
            "\x44" .
            "\xaa" . # stray tail byte
            "\x2f\x25";
        $expect = "\x46\x55\xef\xbf\xbd" .
            "\xdc\x96" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x44" .
            "\xef\xbf\xbd" .
            "\x2f\x25";

        $this->assertCleanUpHex( $expect, $text );
    }

    /**
     * Bad bytes interposed between valid ASCII and a trailing valid
     * two-byte sequence must each be replaced individually.
     */
    public function testInterposeRegression() {
        $text = "\x4e\x30" .
            "\xb1" . # bad tail
            "\x3a" .
            "\x92" . # bad tail
            "\x62\x3a" .
            "\x84" . # bad tail
            "\x43" .
            "\xc6" . # bad head
            "\x3f" .
            "\x92" . # bad tail
            "\xad" . # bad tail
            "\x7d" .
            "\xd9\x95";

        $expect = "\x4e\x30" .
            "\xef\xbf\xbd" .
            "\x3a" .
            "\xef\xbf\xbd" .
            "\x62\x3a" .
            "\xef\xbf\xbd" .
            "\x43" .
            "\xef\xbf\xbd" .
            "\x3f" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x7d" .
            "\xd9\x95";

        $this->assertCleanUpHex( $expect, $text );
    }

    /**
     * An overlong two-byte sequence surrounded by other bad bytes: the
     * overlong pair is expected to collapse into one replacement character.
     */
    public function testOverlongRegression() {
        $text = "\x67" .
            "\x1a" . # forbidden ascii
            "\xea" . # bad head
            "\xc1\xa6" . # overlong sequence
            "\xad" . # bad tail
            "\x1c" . # forbidden ascii
            "\xb0" . # bad tail
            "\x3c" .
            "\x9e"; # bad tail
        $expect = "\x67" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x3c" .
            "\xef\xbf\xbd";

        $this->assertCleanUpHex( $expect, $text );
    }

    /**
     * An encoded surrogate followed by stray bytes: the surrogate is
     * expected to yield one replacement character, as is each stray byte.
     */
    public function testSurrogateRegression() {
        $text = "\xed\xb4\x96" . # surrogate 0xDD16
            "\x83" . # bad tail
            "\xb4" . # bad tail
            "\xac"; # bad head
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd";

        $this->assertCleanUpHex( $expect, $text );
    }

    /**
     * U+FFFE followed by broken bytes and a valid ASCII letter.
     */
    public function testBomRegression() {
        $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
            "\xb2" . # bad tail
            "\xef" . # bad head
            "\x59";
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x59";

        $this->assertCleanUpHex( $expect, $text );
    }

    /**
     * U+FFFF on its own must be replaced.
     */
    public function testForbiddenRegression() {
        $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
        $expect = "\xef\xbf\xbd";

        $this->assertCleanUpHex( $expect, $text );
    }

    /**
     * A Hangul character followed by a further final jamo must be left
     * untouched.
     */
    public function testHangulRegression() {
        $text = "\xed\x9c\xaf" . # Hangul char
            "\xe1\x87\x81"; # followed by another final jamo
        $expect = $text; # Should *not* change.

        $this->assertCleanUpHex( $expect, $text );
    }
}