MediaWiki  REL1_21
CleanUpTest.php
Go to the documentation of this file.
00001 <?php
/**
 * Tests for UtfNormal::cleanUp(): valid UTF-8 must come back in NFC form and
 * every byte or sequence the tests treat as illegal must come back as
 * UTF8_REPLACEMENT.
 *
 * Byte-level expectations are compared hex-encoded and with strict identity
 * (see assertHexSame()) so that a mismatch is readable in the failure output
 * and cannot be hidden by PHP's loose string comparison.
 */
class CleanUpTest extends MediaWikiTestCase {
    /**
     * Assert that two byte strings are identical, reporting them hex-encoded.
     *
     * assertSame() is used on purpose: a loose comparison treats
     * numeric-looking strings as numbers, and hex dumps can look numeric
     * (for example "30" == "3e01" is true in PHP), which could mask a
     * real difference.
     *
     * @param string $expected Expected raw bytes
     * @param string $actual Raw bytes produced by the code under test
     * @param string $message Optional failure message
     */
    private function assertHexSame( $expected, $actual, $message = '' ) {
        $this->assertSame( bin2hex( $expected ), bin2hex( $actual ), $message );
    }

    /** Plain ASCII text must pass through unchanged. */
    public function testAscii() {
        $text = 'This is plain ASCII text.';
        $this->assertSame( $text, UtfNormal::cleanUp( $text ) );
    }

    /** A NUL byte must be replaced by U+FFFD. */
    public function testNull() {
        $text = "a \x00 null";
        $expect = "a \xef\xbf\xbd null";
        $this->assertHexSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /** Already-composed Latin text must pass through unchanged. */
    public function testLatin() {
        $text = "L'\xc3\xa9cole";
        $this->assertSame( $text, UtfNormal::cleanUp( $text ) );
    }

    /** "e" followed by a combining acute accent must be composed to U+00E9. */
    public function testLatinNormal() {
        $text = "L'e\xcc\x81cole";
        $expect = "L'\xc3\xa9cole";
        $this->assertSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Run cleanUp() on each code point from 0 up to (not including)
     * UNICODE_MAX, one at a time.
     *
     * The name does not start with "test", so PHPUnit does not collect it;
     * presumably it is disabled because of the size of the loop.
     * Progress is echoed every 0x1000 code points.
     */
    public function XtestAllChars() {
        $rep = UTF8_REPLACEMENT;
        for ( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
            $char = codepointToUtf8( $i );
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%04X", $i );

            if ( $i % 0x1000 == 0 ) {
                echo "U+$x\n";
            }

            // Code points this test expects to survive: tab, LF, CR, and
            // everything above the C0 controls except the surrogate range
            // and U+FFFE / U+FFFF.
            if ( $i == 0x0009 ||
                $i == 0x000a ||
                $i == 0x000d ||
                ( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
                ( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
                ( $i > 0xffff && $i <= UNICODE_MAX )
            ) {
                if ( isset( UtfNormal::$utfCanonicalComp[$char] )
                    || isset( UtfNormal::$utfCanonicalDecomp[$char] )
                ) {
                    // Characters listed in the normalization tables must
                    // match what NFC() produces for them.
                    $comp = UtfNormal::NFC( $char );
                    $this->assertHexSame( $comp, $clean, "U+$x should be decomposed" );
                } else {
                    $this->assertHexSame( $char, $clean, "U+$x should be intact" );
                }
            } else {
                $this->assertHexSame( $rep, $clean, $x );
            }
        }
    }

    /** Check every single byte value, alone and with ASCII neighbours. */
    public function testAllBytes() {
        $this->doTestBytes( '', '' );
        $this->doTestBytes( 'x', '' );
        $this->doTestBytes( '', 'x' );
        $this->doTestBytes( 'x', 'x' );
    }

    /**
     * Feed each byte value 0x00-0xFF, wrapped in $head and $tail, to
     * cleanUp(). Tab, LF, CR and 0x20-0x7F are expected to stay intact;
     * every other lone byte is expected to become UTF8_REPLACEMENT.
     *
     * @param string $head Text placed before the byte under test
     * @param string $tail Text placed after the byte under test
     */
    public function doTestBytes( $head, $tail ) {
        for ( $i = 0x0; $i < 256; $i++ ) {
            $char = $head . chr( $i ) . $tail;
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%02X", $i );

            if ( $i == 0x0009 ||
                $i == 0x000a ||
                $i == 0x000d ||
                ( $i > 0x001f && $i < 0x80 )
            ) {
                $this->assertHexSame( $char, $clean, "ASCII byte $x should be intact" );
            } else {
                $norm = $head . UTF8_REPLACEMENT . $tail;
                $this->assertHexSame( $norm, $clean, "Forbidden byte $x should be rejected" );
            }
        }
    }

    /** Check two-byte combinations, alone and with ASCII neighbours. */
    public function testDoubleBytes() {
        $this->doTestDoubleBytes( '', '' );
        $this->doTestDoubleBytes( 'x', '' );
        $this->doTestDoubleBytes( '', 'x' );
        $this->doTestDoubleBytes( 'x', 'x' );
    }

    /**
     * Feed pairs of non-ASCII bytes, wrapped in $head and $tail, to cleanUp().
     *
     * Both loops advance in steps of two, so only every other first byte
     * (from 0xC0) and every other second byte (from 0x80) is exercised.
     *
     * @param string $head Text placed before the pair under test
     * @param string $tail Text placed after the pair under test
     */
    public function doTestDoubleBytes( $head, $tail ) {
        for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
            for ( $second = 0x80; $second < 0x100; $second += 2 ) {
                $char = $head . chr( $first ) . chr( $second ) . $tail;
                $clean = UtfNormal::cleanUp( $char );
                $x = sprintf( "%02X,%02X", $first, $second );

                if ( $first > 0xc1 &&
                    $first < 0xe0 &&
                    $second < 0xc0
                ) {
                    // Lead byte 0xC2-0xDF plus a continuation byte: a
                    // well-formed pair, expected to match NFC().
                    $norm = UtfNormal::NFC( $char );
                    $this->assertHexSame( $norm, $clean, "Pair $x should be intact" );
                } elseif ( $first > 0xfd || $second > 0xbf ) {
                    # fe and ff are not legal head bytes -- expect two replacement chars
                    $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $this->assertHexSame( $norm, $clean, "Forbidden pair $x should be rejected" );
                } else {
                    // Everything else is expected to collapse into a
                    // single replacement character.
                    $norm = $head . UTF8_REPLACEMENT . $tail;
                    $this->assertHexSame( $norm, $clean, "Forbidden pair $x should be rejected" );
                }
            }
        }
    }

    /** Check three-byte combinations, alone and with ASCII neighbours. */
    public function testTripleBytes() {
        $this->doTestTripleBytes( '', '' );
        $this->doTestTripleBytes( 'x', '' );
        $this->doTestTripleBytes( '', 'x' );
        $this->doTestTripleBytes( 'x', 'x' );
    }

    /**
     * Feed triples of non-ASCII bytes, wrapped in $head and $tail, to
     * cleanUp().
     *
     * The first two bytes advance in steps of two. The third byte is
     * restricted to 0x80 only; the full 0x80-0xFF sweep is not run.
     *
     * @param string $head Text placed before the triple under test
     * @param string $tail Text placed after the triple under test
     */
    public function doTestTripleBytes( $head, $tail ) {
        for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
            for ( $second = 0x80; $second < 0x100; $second += 2 ) {
                # Full sweep would be: for ( $third = 0x80; $third < 0x100; $third++ )
                for ( $third = 0x80; $third < 0x81; $third++ ) {
                    $char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
                    $clean = UtfNormal::cleanUp( $char );
                    $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

                    if ( $first >= 0xe0 &&
                        $first < 0xf0 &&
                        $second < 0xc0 &&
                        $third < 0xc0
                    ) {
                        // Structurally a three-byte sequence.
                        if ( $first == 0xe0 && $second < 0xa0 ) {
                            $this->assertHexSame(
                                $head . UTF8_REPLACEMENT . $tail,
                                $clean,
                                "Overlong triplet $x should be rejected" );
                        } elseif ( $first == 0xed &&
                            ( chr( $first ) . chr( $second ) . chr( $third ) ) >= UTF8_SURROGATE_FIRST
                        ) {
                            $this->assertHexSame(
                                $head . UTF8_REPLACEMENT . $tail,
                                $clean,
                                "Surrogate triplet $x should be rejected" );
                        } else {
                            $this->assertHexSame(
                                UtfNormal::NFC( $char ),
                                $clean,
                                "Triplet $x should be intact" );
                        }
                    } elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
                        $this->assertHexSame(
                            UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail,
                            $clean,
                            "Valid 2-byte $x + broken tail" );
                    } elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
                        $this->assertHexSame(
                            $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ),
                            $clean,
                            "Broken head + valid 2-byte $x" );
                    } elseif ( ( $first > 0xfd || $second > 0xfd ) &&
                        ( ( $second > 0xbf && $third > 0xbf ) ||
                            ( $second < 0xc0 && $third < 0xc0 ) ||
                            ( $second > 0xfd ) ||
                            ( $third > 0xfd ) )
                    ) {
                        # fe and ff are not legal head bytes -- expect three replacement chars
                        $this->assertHexSame(
                            $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail,
                            $clean,
                            "Forbidden triplet $x should be rejected" );
                    } elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
                        $this->assertHexSame(
                            $head . UTF8_REPLACEMENT . $tail,
                            $clean,
                            "Forbidden triplet $x should be rejected" );
                    } else {
                        $this->assertHexSame(
                            $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail,
                            $clean,
                            "Forbidden triplet $x should be rejected" );
                    }
                }
            }
        }
    }

    /** Regression check for a chunking bug. */
    public function testChunkRegression() {
        $text = "\x46\x55\xb8" .
            "\xdc\x96" .
            "\xee" .
            "\xe7" .
            "\x44" .
            "\xaa" .
            "\x2f\x25";
        $expect = "\x46\x55\xef\xbf\xbd" .
            "\xdc\x96" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x44" .
            "\xef\xbf\xbd" .
            "\x2f\x25";

        $this->assertHexSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /** Regression check: bad head and tail bytes interleaved with ASCII. */
    public function testInterposeRegression() {
        $text = "\x4e\x30" .
            "\xb1" . # bad tail
            "\x3a" .
            "\x92" . # bad tail
            "\x62\x3a" .
            "\x84" . # bad tail
            "\x43" .
            "\xc6" . # bad head
            "\x3f" .
            "\x92" . # bad tail
            "\xad" . # bad tail
            "\x7d" .
            "\xd9\x95";

        $expect = "\x4e\x30" .
            "\xef\xbf\xbd" .
            "\x3a" .
            "\xef\xbf\xbd" .
            "\x62\x3a" .
            "\xef\xbf\xbd" .
            "\x43" .
            "\xef\xbf\xbd" .
            "\x3f" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x7d" .
            "\xd9\x95";

        $this->assertHexSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /** Regression check: an overlong sequence surrounded by other bad bytes. */
    public function testOverlongRegression() {
        $text = "\x67" .
            "\x1a" . # forbidden ascii
            "\xea" . # bad head
            "\xc1\xa6" . # overlong sequence
            "\xad" . # bad tail
            "\x1c" . # forbidden ascii
            "\xb0" . # bad tail
            "\x3c" .
            "\x9e"; # bad tail
        $expect = "\x67" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x3c" .
            "\xef\xbf\xbd";

        $this->assertHexSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /** Regression check: an encoded surrogate followed by stray bytes. */
    public function testSurrogateRegression() {
        $text = "\xed\xb4\x96" . # surrogate 0xDD16
            "\x83" . # bad tail
            "\xb4" . # bad tail
            "\xac"; # bad head
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd";

        $this->assertHexSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /** Regression check: U+FFFE followed by stray bytes. */
    public function testBomRegression() {
        $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
            "\xb2" . # bad tail
            "\xef" . # bad head
            "\x59";
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x59";

        $this->assertHexSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /** Regression check: U+FFFF must be replaced. */
    public function testForbiddenRegression() {
        $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
        $expect = "\xef\xbf\xbd";

        $this->assertHexSame( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Regression check: a Hangul character followed by another final jamo
     * must be left as it is.
     */
    public function testHangulRegression() {
        $text = "\xed\x9c\xaf" . # Hangul char
            "\xe1\x87\x81"; # followed by another final jamo
        $expect = $text; # Should *not* change.

        $this->assertHexSame( $expect, UtfNormal::cleanUp( $text ) );
    }
}