MediaWiki  REL1_19
CleanUpTest.php
Go to the documentation of this file.
00001 <?php
00033 class CleanUpTest extends MediaWikiTestCase {
/**
 * Plain 7-bit ASCII text is expected to come back from cleanUp() unchanged.
 */
public function testAscii() {
	$plain = 'This is plain ASCII text.';
	$cleaned = UtfNormal::cleanUp( $plain );

	$this->assertEquals( $plain, $cleaned );
}
00039 
/**
 * An embedded NUL byte is expected to be swapped for the bytes EF BF BD
 * (U+FFFD) while the surrounding text is kept.
 */
public function testNull() {
	$withNul = "a \x00 null";
	$wanted = "a \xef\xbf\xbd null";

	$actual = UtfNormal::cleanUp( $withNul );

	// Compare hex dumps so a failure message shows the raw bytes.
	$this->assertEquals( bin2hex( $wanted ), bin2hex( $actual ) );
}
00048 
/**
 * Text containing the two-byte sequence C3 A9 (precomposed e-acute) is
 * expected to pass through cleanUp() untouched.
 */
public function testLatin() {
	$precomposed = "L'\xc3\xa9cole";
	$cleaned = UtfNormal::cleanUp( $precomposed );

	$this->assertEquals( $precomposed, $cleaned );
}
00054 
/**
 * "e" followed by CC 81 (combining acute accent) is expected to be
 * composed by cleanUp() into the single character C3 A9.
 */
public function testLatinNormal() {
	$decomposed = "L'e\xcc\x81cole";
	$composed = "L'\xc3\xa9cole";

	$cleaned = UtfNormal::cleanUp( $decomposed );

	$this->assertEquals( $composed, $cleaned );
}
00061 
/**
 * Sweep every code point from 0 through UNICODE_MAX (inclusive) and check
 * what cleanUp() does with its UTF-8 encoding.
 *
 * Expectations encoded below:
 *  - tab, LF, CR and every code point above U+001F that is neither in the
 *    surrogate range nor U+FFFE/U+FFFF must survive; if it appears in the
 *    canonical composition or decomposition tables the result must equal
 *    UtfNormal::NFC() of the character, otherwise the bytes are unchanged;
 *  - everything else must come back as UTF8_REPLACEMENT.
 *
 * Prints a progress line every 0x1000 code points.
 *
 * NOTE(review): the "X" prefix presumably keeps PHPUnit from picking this
 * method up automatically (it iterates over more than a million code
 * points) -- confirm before renaming.
 */
public function XtestAllChars() {
	$rep = UTF8_REPLACEMENT;

	// The upper bound is inclusive: the range check below explicitly
	// treats UNICODE_MAX itself as an allowed code point, so it has to
	// be exercised as well.
	for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
		$char = codepointToUtf8( $i );
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%04X", $i );

		if ( $i % 0x1000 == 0 ) {
			echo "U+$x\n";
		}

		$isAllowed =
			$i == 0x0009 ||
			$i == 0x000a ||
			$i == 0x000d ||
			( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
			( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
			( $i > 0xffff && $i <= UNICODE_MAX );

		if ( !$isAllowed ) {
			$this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
			continue;
		}

		$inNormalizationTables =
			isset( UtfNormal::$utfCanonicalComp[$char] ) ||
			isset( UtfNormal::$utfCanonicalDecomp[$char] );

		if ( $inNormalizationTables ) {
			// The reference value is the NFC form, so the message says
			// "NFC-normalized" rather than "decomposed".
			$comp = UtfNormal::NFC( $char );
			$this->assertEquals(
				bin2hex( $comp ),
				bin2hex( $clean ),
				"U+$x should be NFC-normalized" );
		} else {
			$this->assertEquals(
				bin2hex( $char ),
				bin2hex( $clean ),
				"U+$x should be intact" );
		}
	}
}
00096 
/**
 * Run the single-byte sweep bare, with an ASCII prefix, with an ASCII
 * suffix, and with both.
 */
public function testAllBytes() {
	$contexts = array(
		array( '', '' ),
		array( 'x', '' ),
		array( '', 'x' ),
		array( 'x', 'x' ),
	);

	foreach ( $contexts as $context ) {
		list( $head, $tail ) = $context;
		$this->doTestBytes( $head, $tail );
	}
}
00104 
/**
 * Feed every byte value 0x00-0xFF, wrapped in $head and $tail, through
 * cleanUp() and check the result.
 *
 * Tab, LF, CR and 0x20-0x7F are expected to survive unchanged; any other
 * byte is expected to be replaced by UTF8_REPLACEMENT, with $head and
 * $tail kept as they are.
 *
 * The former "if ( $a != $b ) return;" lines after each assertion were
 * dropped: a failing assertEquals() throws, and a passing one means both
 * strings are identical, so those returns could never execute.
 *
 * @param $head String: text placed before the byte under test
 * @param $tail String: text placed after the byte under test
 */
public function doTestBytes( $head, $tail ) {
	for ( $i = 0x0; $i < 256; $i++ ) {
		$char = $head . chr( $i ) . $tail;
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%02X", $i );

		$isPermittedAscii =
			$i == 0x0009 ||
			$i == 0x000a ||
			$i == 0x000d ||
			( $i > 0x001f && $i < 0x80 );

		if ( $isPermittedAscii ) {
			$this->assertEquals(
				bin2hex( $char ),
				bin2hex( $clean ),
				"ASCII byte $x should be intact" );
		} else {
			$norm = $head . UTF8_REPLACEMENT . $tail;
			$this->assertEquals(
				bin2hex( $norm ),
				bin2hex( $clean ),
				"Forbidden byte $x should be rejected" );
		}
	}
}
00130 
/**
 * Run the two-byte sweep bare, with an ASCII prefix, with an ASCII
 * suffix, and with both.
 */
public function testDoubleBytes() {
	$contexts = array(
		array( '', '' ),
		array( 'x', '' ),
		array( '', 'x' ),
		array( 'x', 'x' ),
	);

	foreach ( $contexts as $context ) {
		list( $head, $tail ) = $context;
		$this->doTestDoubleBytes( $head, $tail );
	}
}
00138 
/**
 * Check cleanUp() on two-byte combinations wrapped in $head and $tail.
 *
 * Both bytes advance in steps of two, so only even byte values are
 * sampled: first byte 0xC0-0xFE, second byte 0x80-0xFE.
 *
 * Expectations encoded below:
 *  - first byte 0xC2-0xDF with a second byte below 0xC0: the result must
 *    equal UtfNormal::NFC() of the whole input;
 *  - first byte above 0xFD, or second byte above 0xBF: two replacement
 *    characters;
 *  - anything else: a single replacement character.
 *
 * The former "if ( $norm != $clean ) return;" lines after each assertion
 * were dropped: a failing assertEquals() throws, and a passing one means
 * both strings are identical, so those returns could never execute.
 *
 * @param $head String: text placed before the byte pair
 * @param $tail String: text placed after the byte pair
 */
public function doTestDoubleBytes( $head, $tail ) {
	for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
		for ( $second = 0x80; $second < 0x100; $second += 2 ) {
			$char = $head . chr( $first ) . chr( $second ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X,%02X", $first, $second );

			if ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
				$norm = UtfNormal::NFC( $char );
				$message = "Pair $x should be intact";
			} elseif ( $first > 0xfd || $second > 0xbf ) {
				# fe and ff are not legal head bytes -- expect two replacement chars
				$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
				$message = "Forbidden pair $x should be rejected";
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$message = "Forbidden pair $x should be rejected";
			}

			$this->assertEquals( bin2hex( $norm ), bin2hex( $clean ), $message );
		}
	}
}
00176 
/**
 * Run the three-byte sweep bare, with an ASCII prefix, with an ASCII
 * suffix, and with both.
 */
public function testTripleBytes() {
	$contexts = array(
		array( '', '' ),
		array( 'x', '' ),
		array( '', 'x' ),
		array( 'x', 'x' ),
	);

	foreach ( $contexts as $context ) {
		list( $head, $tail ) = $context;
		$this->doTestTripleBytes( $head, $tail );
	}
}
00184 
/**
 * Check cleanUp() on three-byte combinations wrapped in $head and $tail.
 *
 * The first byte runs over 0xC0-0xFE and the second over 0x80-0xFE, both
 * in steps of two; the third byte is always 0x80. For each combination
 * the expected output and failure message are chosen by the condition
 * chain below and checked with a single assertion.
 *
 * NOTE(review): because the first byte only takes even values, the
 * "$first == 0xed" surrogate branch is never entered by this loop.
 *
 * @param $head String: text placed before the byte triplet
 * @param $tail String: text placed after the byte triplet
 */
public function doTestTripleBytes( $head, $tail ) {
	$rep = UTF8_REPLACEMENT;
	// Only one value of the third byte is exercised.
	$third = 0x80;

	for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
		for ( $second = 0x80; $second < 0x100; $second += 2 ) {
			$b1 = chr( $first );
			$b2 = chr( $second );
			$b3 = chr( $third );

			$char = $head . $b1 . $b2 . $b3 . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

			if ( $first >= 0xe0 && $first < 0xf0 && $second < 0xc0 && $third < 0xc0 ) {
				// Shaped like a three-byte sequence.
				if ( $first == 0xe0 && $second < 0xa0 ) {
					$expected = $head . $rep . $tail;
					$message = "Overlong triplet $x should be rejected";
				} elseif ( $first == 0xed && ( $b1 . $b2 . $b3 ) >= UTF8_SURROGATE_FIRST ) {
					$expected = $head . $rep . $tail;
					$message = "Surrogate triplet $x should be rejected";
				} else {
					$expected = UtfNormal::NFC( $char );
					$message = "Triplet $x should be intact";
				}
			} elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
				// Two-byte sequence followed by a stray third byte.
				$expected = UtfNormal::NFC( $head . $b1 . $b2 ) . $rep . $tail;
				$message = "Valid 2-byte $x + broken tail";
			} elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
				// Stray first byte followed by a two-byte sequence.
				$expected = $head . $rep . UtfNormal::NFC( $b2 . $b3 . $tail );
				$message = "Broken head + valid 2-byte $x";
			} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
				( ( $second > 0xbf && $third > 0xbf ) ||
					( $second < 0xc0 && $third < 0xc0 ) ||
					( $second > 0xfd ) ||
					( $third > 0xfd ) )
			) {
				# fe and ff are not legal head bytes -- expect three replacement chars
				$expected = $head . $rep . $rep . $rep . $tail;
				$message = "Forbidden triplet $x should be rejected";
			} elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
				$expected = $head . $rep . $tail;
				$message = "Forbidden triplet $x should be rejected";
			} else {
				$expected = $head . $rep . $rep . $tail;
				$message = "Forbidden triplet $x should be rejected";
			}

			$this->assertEquals( bin2hex( $expected ), bin2hex( $clean ), $message );
		}
	}
}
00250 
/**
 * Regression check against a chunking bug: invalid bytes scattered around
 * a two-byte sequence must each become one replacement character while
 * the rest of the text is kept.
 */
public function testChunkRegression() {
	$input = implode( '', array(
		"\x46\x55\xb8",
		"\xdc\x96",
		"\xee",
		"\xe7",
		"\x44",
		"\xaa",
		"\x2f\x25",
	) );

	$wanted = implode( '', array(
		"\x46\x55\xef\xbf\xbd",
		"\xdc\x96",
		"\xef\xbf\xbd",
		"\xef\xbf\xbd",
		"\x44",
		"\xef\xbf\xbd",
		"\x2f\x25",
	) );

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $wanted ), bin2hex( $actual ) );
}
00273 
/**
 * Regression check: stray continuation bytes and a dangling lead byte
 * interleaved with valid text must each be replaced individually.
 */
public function testInterposeRegression() {
	$rep = "\xef\xbf\xbd";

	// Each row pairs an input fragment with the output expected for it.
	$fragments = array(
		array( "\x4e\x30", "\x4e\x30" ),
		array( "\xb1", $rep ),          # bad tail
		array( "\x3a", "\x3a" ),
		array( "\x92", $rep ),          # bad tail
		array( "\x62\x3a", "\x62\x3a" ),
		array( "\x84", $rep ),          # bad tail
		array( "\x43", "\x43" ),
		array( "\xc6", $rep ),          # bad head
		array( "\x3f", "\x3f" ),
		array( "\x92", $rep ),          # bad tail
		array( "\xad", $rep ),          # bad tail
		array( "\x7d", "\x7d" ),
		array( "\xd9\x95", "\xd9\x95" ),
	);

	$text = '';
	$expect = '';
	foreach ( $fragments as $fragment ) {
		list( $raw, $cleaned ) = $fragment;
		$text .= $raw;
		$expect .= $cleaned;
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
00308 
/**
 * Regression check: a run mixing forbidden control characters, a dangling
 * lead byte, an overlong two-byte sequence and stray continuation bytes
 * is expected to collapse into six replacement characters, with the
 * valid ASCII around it preserved.
 */
public function testOverlongRegression() {
	$rep = "\xef\xbf\xbd";

	$input = implode( '', array(
		"\x67",
		"\x1a",     # forbidden ascii
		"\xea",     # bad head
		"\xc1\xa6", # overlong sequence
		"\xad",     # bad tail
		"\x1c",     # forbidden ascii
		"\xb0",     # bad tail
		"\x3c",
		"\x9e",     # bad tail
	) );

	$wanted = "\x67" . str_repeat( $rep, 6 ) . "\x3c" . $rep;

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $wanted ), bin2hex( $actual ) );
}
00333 
/**
 * Regression check: an encoded surrogate followed by three stray
 * continuation bytes is expected to yield four replacement characters.
 */
public function testSurrogateRegression() {
	$input = implode( '', array(
		"\xed\xb4\x96", # surrogate 0xDD16
		"\x83",         # bad tail
		"\xb4",         # bad tail
		"\xac",         # bad tail (0xac is a continuation byte)
	) );

	$wanted = str_repeat( "\xef\xbf\xbd", 4 );

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $wanted ), bin2hex( $actual ) );
}
00348 
/**
 * Regression check: U+FFFE, a stray continuation byte and a dangling lead
 * byte are each expected to become a replacement character, and the
 * trailing ASCII byte is kept.
 */
public function testBomRegression() {
	$input = implode( '', array(
		"\xef\xbf\xbe", # U+FFFE, illegal char
		"\xb2",         # bad tail
		"\xef",         # bad head
		"\x59",
	) );

	$wanted = str_repeat( "\xef\xbf\xbd", 3 ) . "\x59";

	$actual = UtfNormal::cleanUp( $input );

	$this->assertEquals( bin2hex( $wanted ), bin2hex( $actual ) );
}
00363 
/**
 * Regression check: the encoding of U+FFFF is expected to be replaced by
 * a single replacement character.
 */
public function testForbiddenRegression() {
	$illegal = "\xef\xbf\xbf"; # U+FFFF, illegal char
	$wanted = "\xef\xbf\xbd";

	$actual = UtfNormal::cleanUp( $illegal );

	$this->assertEquals( bin2hex( $wanted ), bin2hex( $actual ) );
}
00372 
/**
 * Regression check: a Hangul character followed by an additional final
 * jamo is expected to come back from cleanUp() byte-for-byte unchanged.
 */
public function testHangulRegression() {
	$syllable = "\xed\x9c\xaf";  # Hangul char
	$finalJamo = "\xe1\x87\x81"; # followed by another final jamo
	$input = $syllable . $finalJamo;

	$actual = UtfNormal::cleanUp( $input );

	# Should *not* change.
	$this->assertEquals( bin2hex( $input ), bin2hex( $actual ) );
}
00382 }