MediaWiki
REL1_21
|
<?php

/**
 * Tests for UtfNormal::cleanUp().
 *
 * The cases cover plain and already-normalized text, single invalid bytes,
 * two- and three-byte sequences (valid, overlong, surrogate, truncated) and a
 * set of regression inputs mixing valid and invalid bytes.
 *
 * Results are compared with assertSame(). Most comparisons are made on
 * bin2hex() dumps so that a failure message stays printable, and such dumps
 * can look like numbers to PHP ("0e12" and "0e34" compare equal with ==).
 * A strict comparison guarantees the bytes really are identical.
 */
class CleanUpTest extends MediaWikiTestCase {

	/**
	 * Plain ASCII text must come back unchanged.
	 */
	public function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * A NUL byte must be replaced by U+FFFD (bytes EF BF BD).
	 */
	public function testNull() {
		$text = "a \x00 null";
		$expect = "a \xef\xbf\xbd null";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Text containing a precomposed character (C3 A9) must come back unchanged.
	 */
	public function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * "e" followed by the combining sequence CC 81 must come back as the
	 * single precomposed character C3 A9.
	 */
	public function testLatinNormal() {
		$text = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertSame( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Exhaustive per-code-point check.
	 *
	 * The method name does not start with "test", so the test runner does not
	 * pick it up; rename it to testAllChars to run it. It prints a progress
	 * line every 0x1000 code points.
	 *
	 * NOTE(review): the loop stops before UNICODE_MAX itself although the range
	 * check below accepts "<= UNICODE_MAX" -- confirm whether the last code
	 * point should be covered too.
	 */
	public function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		for ( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );

			if ( $i % 0x1000 == 0 ) {
				echo "U+$x\n";
			}

			// Code points expected to survive: tab, LF, CR, and everything from
			// U+0020 up, minus the surrogate range and U+FFFE / U+FFFF.
			if ( $i == 0x0009 ||
				$i == 0x000a ||
				$i == 0x000d ||
				( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
				( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
				( $i > 0xffff && $i <= UNICODE_MAX )
			) {
				if ( isset( UtfNormal::$utfCanonicalComp[$char] )
					|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
				) {
					// The character appears in a normalization table, so the
					// cleaned result must match its NFC form.
					$comp = UtfNormal::NFC( $char );
					$this->assertSame(
						bin2hex( $comp ),
						bin2hex( $clean ),
						"U+$x should be decomposed" );
				} else {
					$this->assertSame(
						bin2hex( $char ),
						bin2hex( $clean ),
						"U+$x should be intact" );
				}
			} else {
				$this->assertSame( bin2hex( $rep ), bin2hex( $clean ), $x );
			}
		}
	}

	/**
	 * Run the single-byte check bare and with an ASCII neighbour on either or
	 * both sides.
	 */
	public function testAllBytes() {
		$this->doTestBytes( '', '' );
		$this->doTestBytes( 'x', '' );
		$this->doTestBytes( '', 'x' );
		$this->doTestBytes( 'x', 'x' );
	}

	/**
	 * Check every byte value 0x00-0xFF, wrapped in $head and $tail.
	 *
	 * Tab, LF, CR and 0x20-0x7F must be kept; every other byte must become
	 * UTF8_REPLACEMENT. A failing assertion throws, which ends the loop, so no
	 * explicit early return is needed.
	 *
	 * @param string $head Text placed before the byte under test
	 * @param string $tail Text placed after the byte under test
	 */
	public function doTestBytes( $head, $tail ) {
		for ( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );

			if ( $i == 0x0009 ||
				$i == 0x000a ||
				$i == 0x000d ||
				( $i > 0x001f && $i < 0x80 )
			) {
				$this->assertSame(
					bin2hex( $char ),
					bin2hex( $clean ),
					"ASCII byte $x should be intact" );
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertSame(
					bin2hex( $norm ),
					bin2hex( $clean ),
					"Forbidden byte $x should be rejected" );
			}
		}
	}

	/**
	 * Run the two-byte check bare and with an ASCII neighbour on either or
	 * both sides.
	 */
	public function testDoubleBytes() {
		$this->doTestDoubleBytes( '', '' );
		$this->doTestDoubleBytes( 'x', '' );
		$this->doTestDoubleBytes( '', 'x' );
		$this->doTestDoubleBytes( 'x', 'x' );
	}

	/**
	 * Check two-byte combinations wrapped in $head and $tail.
	 *
	 * Both loops advance in steps of 2, so only every other byte value is
	 * sampled: first bytes 0xC0, 0xC2, ... 0xFE and second bytes 0x80, 0x82,
	 * ... 0xFE. A failing assertion throws, which ends the loops.
	 *
	 * @param string $head Text placed before the pair under test
	 * @param string $tail Text placed after the pair under test
	 */
	public function doTestDoubleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );
				if ( $first > 0xc1 &&
					$first < 0xe0 &&
					$second < 0xc0
				) {
					// Lead byte C2-DF followed by a byte in 80-BF: the result
					// must equal the NFC form of the input.
					$norm = UtfNormal::NFC( $char );
					$this->assertSame(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Pair $x should be intact" );
				} elseif ( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes -- expect two replacement chars
					# (the same applies when the second byte is above 0xbf)
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$this->assertSame(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Forbidden pair $x should be rejected" );
				} else {
					// Remaining combinations: a single replacement char is expected.
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$this->assertSame(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Forbidden pair $x should be rejected" );
				}
			}
		}
	}

	/**
	 * Run the three-byte check bare and with an ASCII neighbour on either or
	 * both sides.
	 */
	public function testTripleBytes() {
		$this->doTestTripleBytes( '', '' );
		$this->doTestTripleBytes( 'x', '' );
		$this->doTestTripleBytes( '', 'x' );
		$this->doTestTripleBytes( 'x', 'x' );
	}

	/**
	 * Check three-byte combinations wrapped in $head and $tail.
	 *
	 * First and second bytes are sampled in steps of 2; the third byte is
	 * currently fixed at 0x80 (the full third-byte loop is kept commented out
	 * below). The branch order matters: each elseif only sees the combinations
	 * the earlier branches did not claim.
	 *
	 * @param string $head Text placed before the triplet under test
	 * @param string $tail Text placed after the triplet under test
	 */
	public function doTestTripleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				#for( $third = 0x80; $third < 0x100; $third++ ) {
				for ( $third = 0x80; $third < 0x81; $third++ ) {
					$char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

					if ( $first >= 0xe0 &&
						$first < 0xf0 &&
						$second < 0xc0 &&
						$third < 0xc0
					) {
						// Lead byte E0-EF followed by two bytes in 80-BF.
						if ( $first == 0xe0 && $second < 0xa0 ) {
							$this->assertSame(
								bin2hex( $head . UTF8_REPLACEMENT . $tail ),
								bin2hex( $clean ),
								"Overlong triplet $x should be rejected" );
						} elseif ( $first == 0xed &&
							( chr( $first ) . chr( $second ) . chr( $third ) ) >= UTF8_SURROGATE_FIRST
						) {
							$this->assertSame(
								bin2hex( $head . UTF8_REPLACEMENT . $tail ),
								bin2hex( $clean ),
								"Surrogate triplet $x should be rejected" );
						} else {
							$this->assertSame(
								bin2hex( UtfNormal::NFC( $char ) ),
								bin2hex( $clean ),
								"Triplet $x should be intact" );
						}
					} elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						// The first two bytes form a two-byte pair; the third is a stray byte.
						$this->assertSame(
							bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) .
								UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Valid 2-byte $x + broken tail" );
					} elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						// The first byte is a stray byte; the last two form a two-byte pair.
						$this->assertSame(
							bin2hex( $head . UTF8_REPLACEMENT .
								UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
							bin2hex( $clean ),
							"Broken head + valid 2-byte $x" );
					} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
						( ( $second > 0xbf && $third > 0xbf ) ||
							( $second < 0xc0 && $third < 0xc0 ) ||
							( $second > 0xfd ) ||
							( $third > 0xfd ) )
					) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$this->assertSame(
							bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					} elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
						$this->assertSame(
							bin2hex( $head . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					} else {
						$this->assertSame(
							bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					}
				}
			}
		}
	}

	/**
	 * Regression input mixing stray bytes with one valid two-byte pair
	 * (DC 96); each stray byte must become its own replacement char.
	 */
	public function testChunkRegression() {
		# Check for regression against a chunking bug
		$text = "\x46\x55\xb8" .
			"\xdc\x96" .
			"\xee" .
			"\xe7" .
			"\x44" .
			"\xaa" .
			"\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
			"\xdc\x96" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x44" .
			"\xef\xbf\xbd" .
			"\x2f\x25";

		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input with stray bytes interleaved between ASCII characters
	 * and ending in a valid two-byte pair (D9 95).
	 */
	public function testInterposeRegression() {
		$text = "\x4e\x30" .
			"\xb1" . # bad tail
			"\x3a" .
			"\x92" . # bad tail
			"\x62\x3a" .
			"\x84" . # bad tail
			"\x43" .
			"\xc6" . # bad head
			"\x3f" .
			"\x92" . # bad tail
			"\xad" . # bad tail
			"\x7d" .
			"\xd9\x95";

		$expect = "\x4e\x30" .
			"\xef\xbf\xbd" .
			"\x3a" .
			"\xef\xbf\xbd" .
			"\x62\x3a" .
			"\xef\xbf\xbd" .
			"\x43" .
			"\xef\xbf\xbd" .
			"\x3f" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x7d" .
			"\xd9\x95";

		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input containing an overlong sequence (C1 A6) among
	 * forbidden control bytes and stray bytes.
	 */
	public function testOverlongRegression() {
		$text = "\x67" .
			"\x1a" . # forbidden ascii
			"\xea" . # bad head
			"\xc1\xa6" . # overlong sequence
			"\xad" . # bad tail
			"\x1c" . # forbidden ascii
			"\xb0" . # bad tail
			"\x3c" .
			"\x9e"; # bad tail
		$expect = "\x67" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x3c" .
			"\xef\xbf\xbd";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input starting with an encoded surrogate followed by stray
	 * bytes; four replacement chars are expected.
	 */
	public function testSurrogateRegression() {
		$text = "\xed\xb4\x96" . # surrogate 0xDD16
			"\x83" . # bad tail
			"\xb4" . # bad tail
			"\xac"; # bad head
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input starting with U+FFFE followed by stray bytes and one
	 * ASCII character.
	 */
	public function testBomRegression() {
		$text = "\xef\xbf\xbe" . # U+FFFE, illegal char
			"\xb2" . # bad tail
			"\xef" . # bad head
			"\x59";
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x59";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * U+FFFF on its own must be replaced by a single replacement char.
	 */
	public function testForbiddenRegression() {
		$text = "\xef\xbf\xbf"; # U+FFFF, illegal char
		$expect = "\xef\xbf\xbd";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * A Hangul character followed by a further final jamo must come back
	 * unchanged.
	 */
	public function testHangulRegression() {
		$text = "\xed\x9c\xaf" . # Hangul char
			"\xe1\x87\x81"; # followed by another final jamo
		$expect = $text; # Should *not* change.
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}
}