MediaWiki
REL1_22
|
<?php

/**
 * Tests for UtfNormal::cleanUp().
 *
 * Covers pass-through of already-clean text, replacement of forbidden or
 * malformed byte sequences with U+FFFD (UTF8_REPLACEMENT), and a set of
 * regression inputs.
 *
 * Most comparisons go through bin2hex() so that a failure message shows
 * readable hex instead of raw (possibly invalid) UTF-8. All comparisons are
 * strict: hex strings such as "0e12" and "0e34" are numeric strings in PHP
 * and would be considered equal by a loose comparison.
 */
class CleanUpTest extends MediaWikiTestCase {
	/**
	 * Plain ASCII text must come back unchanged.
	 */
	public function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * A NUL byte must be replaced by U+FFFD.
	 */
	public function testNull() {
		$text = "a \x00 null";
		$expect = "a \xef\xbf\xbd null";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Text containing a precomposed e-acute (U+00E9) must come back unchanged.
	 */
	public function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * "e" followed by a combining acute accent (U+0301) must be composed
	 * into the single character U+00E9.
	 */
	public function testLatinNormal() {
		$text = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertSame( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Exhaustive check of every code point from U+0000 to UNICODE_MAX.
	 *
	 * Not run automatically: the method name does not start with "test".
	 * NOTE(review): presumably disabled because of its runtime -- confirm
	 * before renaming it back.
	 */
	public function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		// Inclusive upper bound: the expectation below explicitly covers
		// $i <= UNICODE_MAX, so the last code point has to be visited too.
		for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );

			// Progress output, once every 0x1000 code points.
			if ( $i % 0x1000 === 0 ) {
				echo "U+$x\n";
			}

			if ( $i === 0x0009 ||
				$i === 0x000a ||
				$i === 0x000d ||
				( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
				( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
				( $i > 0xffff && $i <= UNICODE_MAX )
			) {
				if ( isset( UtfNormal::$utfCanonicalComp[$char] )
					|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
				) {
					// The character takes part in canonical (de)composition,
					// so the expected output is whatever NFC() makes of it.
					$comp = UtfNormal::NFC( $char );
					$this->assertSame(
						bin2hex( $comp ),
						bin2hex( $clean ),
						"U+$x should be normalized to NFC" );
				} else {
					$this->assertSame(
						bin2hex( $char ),
						bin2hex( $clean ),
						"U+$x should be intact" );
				}
			} else {
				// Everything else (other control characters, surrogates,
				// U+FFFE and U+FFFF) is expected to become U+FFFD.
				$this->assertSame( bin2hex( $rep ), bin2hex( $clean ), $x );
			}
		}
	}

	/**
	 * Every single byte value, alone and with an ASCII neighbour on either
	 * or both sides.
	 */
	public function testAllBytes() {
		$this->doTestBytes( '', '' );
		$this->doTestBytes( 'x', '' );
		$this->doTestBytes( '', 'x' );
		$this->doTestBytes( 'x', 'x' );
	}

	/**
	 * Run cleanUp() on each byte 0x00-0xFF wrapped in $head and $tail.
	 *
	 * Tab, LF, CR and the bytes 0x20-0x7F must survive; every other byte
	 * must be replaced by a single U+FFFD. A failed assertion throws, so the
	 * loop stops at the first mismatch.
	 *
	 * @param string $head Text placed before the byte under test
	 * @param string $tail Text placed after the byte under test
	 */
	public function doTestBytes( $head, $tail ) {
		for ( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );

			if ( $i === 0x0009 ||
				$i === 0x000a ||
				$i === 0x000d ||
				( $i > 0x001f && $i < 0x80 )
			) {
				$this->assertSame(
					bin2hex( $char ),
					bin2hex( $clean ),
					"ASCII byte $x should be intact" );
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertSame(
					bin2hex( $norm ),
					bin2hex( $clean ),
					"Forbidden byte $x should be rejected" );
			}
		}
	}

	/**
	 * Two-byte sequences, alone and with an ASCII neighbour on either or
	 * both sides.
	 */
	public function testDoubleBytes() {
		$this->doTestDoubleBytes( '', '' );
		$this->doTestDoubleBytes( 'x', '' );
		$this->doTestDoubleBytes( '', 'x' );
		$this->doTestDoubleBytes( 'x', 'x' );
	}

	/**
	 * Run cleanUp() on two-byte combinations wrapped in $head and $tail.
	 *
	 * Both bytes are stepped by 2, so only every other value in
	 * 0xC0-0xFE (first byte) and 0x80-0xFE (second byte) is sampled.
	 *
	 * @param string $head Text placed before the pair under test
	 * @param string $tail Text placed after the pair under test
	 */
	public function doTestDoubleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );

				if ( $first > 0xc1 &&
					$first < 0xe0 &&
					$second < 0xc0
				) {
					// Well-formed two-byte sequence: head byte 0xC2-0xDF
					// followed by a tail byte 0x80-0xBF.
					$norm = UtfNormal::NFC( $char );
					$this->assertSame(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Pair $x should be intact" );
				} elseif ( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes -- expect two replacement chars
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$this->assertSame(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Forbidden pair $x should be rejected" );
				} else {
					// The broken pair is expected to collapse into a single
					// replacement character.
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$this->assertSame(
						bin2hex( $norm ),
						bin2hex( $clean ),
						"Forbidden pair $x should be rejected" );
				}
			}
		}
	}

	/**
	 * Three-byte sequences, alone and with an ASCII neighbour on either or
	 * both sides.
	 */
	public function testTripleBytes() {
		$this->doTestTripleBytes( '', '' );
		$this->doTestTripleBytes( 'x', '' );
		$this->doTestTripleBytes( '', 'x' );
		$this->doTestTripleBytes( 'x', 'x' );
	}

	/**
	 * Run cleanUp() on three-byte combinations wrapped in $head and $tail.
	 *
	 * The first two bytes are stepped by 2; the third byte is pinned to
	 * 0x80 (see the note on the innermost loop).
	 *
	 * @param string $head Text placed before the triplet under test
	 * @param string $tail Text placed after the triplet under test
	 */
	public function doTestTripleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				// Only the third byte 0x80 is exercised. The full range
				// would be: for ( $third = 0x80; $third < 0x100; $third++ )
				// NOTE(review): presumably narrowed to keep the runtime
				// down -- confirm before widening.
				for ( $third = 0x80; $third < 0x81; $third++ ) {
					$char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

					if ( $first >= 0xe0 &&
						$first < 0xf0 &&
						$second < 0xc0 &&
						$third < 0xc0
					) {
						// Structurally a three-byte sequence: head byte
						// 0xE0-0xEF followed by two tail bytes.
						if ( $first === 0xe0 && $second < 0xa0 ) {
							$this->assertSame(
								bin2hex( $head . UTF8_REPLACEMENT . $tail ),
								bin2hex( $clean ),
								"Overlong triplet $x should be rejected" );
						} elseif ( $first === 0xed &&
							// Byte-wise string comparison against the
							// UTF-8 form of the first surrogate.
							( chr( $first ) . chr( $second ) . chr( $third ) ) >= UTF8_SURROGATE_FIRST
						) {
							$this->assertSame(
								bin2hex( $head . UTF8_REPLACEMENT . $tail ),
								bin2hex( $clean ),
								"Surrogate triplet $x should be rejected" );
						} else {
							$this->assertSame(
								bin2hex( UtfNormal::NFC( $char ) ),
								bin2hex( $clean ),
								"Triplet $x should be intact" );
						}
					} elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						// First two bytes form a valid two-byte sequence;
						// the third byte is a stray tail.
						$this->assertSame(
							bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Valid 2-byte $x + broken tail" );
					} elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						// First byte is broken; the last two bytes form a
						// valid two-byte sequence.
						$this->assertSame(
							bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
							bin2hex( $clean ),
							"Broken head + valid 2-byte $x" );
					} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
						( ( $second > 0xbf && $third > 0xbf ) ||
							( $second < 0xc0 && $third < 0xc0 ) ||
							( $second > 0xfd ) ||
							( $third > 0xfd ) )
					) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$this->assertSame(
							bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					} elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
						$this->assertSame(
							bin2hex( $head . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					} else {
						$this->assertSame(
							bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
							bin2hex( $clean ),
							"Forbidden triplet $x should be rejected" );
					}
				}
			}
		}
	}

	/**
	 * Regression input for a chunking bug: stray bytes around a valid
	 * two-byte sequence.
	 */
	public function testChunkRegression() {
		# Check for regression against a chunking bug
		$text = "\x46\x55\xb8" .
			"\xdc\x96" .
			"\xee" .
			"\xe7" .
			"\x44" .
			"\xaa" .
			"\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
			"\xdc\x96" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x44" .
			"\xef\xbf\xbd" .
			"\x2f\x25";

		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input with bad head and tail bytes interposed between
	 * valid ASCII; each bad byte must become its own U+FFFD.
	 */
	public function testInterposeRegression() {
		$text = "\x4e\x30" .
			"\xb1" . # bad tail
			"\x3a" .
			"\x92" . # bad tail
			"\x62\x3a" .
			"\x84" . # bad tail
			"\x43" .
			"\xc6" . # bad head
			"\x3f" .
			"\x92" . # bad tail
			"\xad" . # bad tail
			"\x7d" .
			"\xd9\x95";

		$expect = "\x4e\x30" .
			"\xef\xbf\xbd" .
			"\x3a" .
			"\xef\xbf\xbd" .
			"\x62\x3a" .
			"\xef\xbf\xbd" .
			"\x43" .
			"\xef\xbf\xbd" .
			"\x3f" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x7d" .
			"\xd9\x95";

		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input mixing forbidden ASCII, bad heads/tails and an
	 * overlong two-byte sequence.
	 */
	public function testOverlongRegression() {
		$text = "\x67" .
			"\x1a" . # forbidden ascii
			"\xea" . # bad head
			"\xc1\xa6" . # overlong sequence
			"\xad" . # bad tail
			"\x1c" . # forbidden ascii
			"\xb0" . # bad tail
			"\x3c" .
			"\x9e"; # bad tail
		$expect = "\x67" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x3c" .
			"\xef\xbf\xbd";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input: an encoded surrogate followed by stray bytes.
	 */
	public function testSurrogateRegression() {
		$text = "\xed\xb4\x96" . # surrogate 0xDD16
			"\x83" . # bad tail
			"\xb4" . # bad tail
			"\xac"; # bad head
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input: U+FFFE followed by stray bytes and a valid ASCII
	 * character.
	 */
	public function testBomRegression() {
		$text = "\xef\xbf\xbe" . # U+FFFE, illegal char
			"\xb2" . # bad tail
			"\xef" . # bad head
			"\x59";
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x59";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input: U+FFFF must be replaced by U+FFFD.
	 */
	public function testForbiddenRegression() {
		$text = "\xef\xbf\xbf"; # U+FFFF, illegal char
		$expect = "\xef\xbf\xbd";
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}

	/**
	 * Regression input: a Hangul character followed by a further final
	 * jamo must not be altered.
	 */
	public function testHangulRegression() {
		$text = "\xed\x9c\xaf" . # Hangul char
			"\xe1\x87\x81"; # followed by another final jamo
		$expect = $text; # Should *not* change.
		$this->assertSame(
			bin2hex( $expect ),
			bin2hex( UtfNormal::cleanUp( $text ) ) );
	}
}