MediaWiki
REL1_24
|
<?php
/**
 * Tests for UtfNormal::cleanUp().
 *
 * The cases feed well-formed and deliberately malformed byte sequences to
 * cleanUp() and compare the result, byte for byte, with the expected output.
 * Invalid input is expected to come back as UTF8_REPLACEMENT (written as the
 * literal "\xef\xbf\xbd" in the regression cases); valid input is expected to
 * come back unchanged or as the result of UtfNormal::NFC().
 */
class CleanUpTest extends MediaWikiTestCase {
	/**
	 * Assert that two byte strings are identical, reporting them as hex.
	 *
	 * Both sides are passed through bin2hex() so that a failure message shows
	 * readable hex rather than raw (possibly invalid) UTF-8. assertSame() is
	 * used instead of assertEquals() so that two different hex strings that
	 * both happen to look numeric can never be treated as equal.
	 *
	 * @param string $expected Expected bytes
	 * @param string $actual Bytes actually produced
	 * @param string $message Optional failure message
	 */
	private function assertSameBytes( $expected, $actual, $message = '' ) {
		$this->assertSame( bin2hex( $expected ), bin2hex( $actual ), $message );
	}

	/**
	 * Plain ASCII text must come back unchanged.
	 */
	public function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * A NUL byte must be replaced by "\xef\xbf\xbd".
	 */
	public function testNull() {
		$text = "a \x00 null";
		$expect = "a \xef\xbf\xbd null";
		$this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * A precomposed two-byte sequence ("\xc3\xa9") must come back unchanged.
	 */
	public function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * "e" followed by "\xcc\x81" must come back as the single sequence "\xc3\xa9".
	 */
	public function testLatinNormal() {
		$text = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertSame( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Sweep over code points and check each one individually.
	 *
	 * The "X" prefix keeps the method name from matching PHPUnit's test*
	 * naming convention, so it is not picked up as a test.
	 *
	 * NOTE(review): the loop stops at UNICODE_MAX - 1 although the condition
	 * below allows $i <= UNICODE_MAX, so UNICODE_MAX itself is never checked.
	 * Left as is; confirm the intended bound before enabling this test.
	 */
	public function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		for ( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );

			# Progress marker every 0x1000 code points
			if ( $i % 0x1000 == 0 ) {
				echo "U+$x\n";
			}

			# Tab, LF, CR and everything above 0x1f is allowed, except the
			# surrogate range and 0xfffe / 0xffff.
			$allowed = $i === 0x0009 ||
				$i === 0x000a ||
				$i === 0x000d ||
				( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
				( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
				( $i > 0xffff && $i <= UNICODE_MAX );

			if ( !$allowed ) {
				$this->assertSameBytes( $rep, $clean, $x );
				continue;
			}

			if ( isset( UtfNormal::$utfCanonicalComp[$char] )
				|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
			) {
				# Characters listed in either table are compared against NFC()
				$comp = UtfNormal::NFC( $char );
				$this->assertSameBytes( $comp, $clean, "U+$x should be decomposed" );
			} else {
				$this->assertSameBytes( $char, $clean, "U+$x should be intact" );
			}
		}
	}

	/**
	 * Data provider: every combination of an empty or "x" prefix and suffix
	 * to wrap around the bytes under test.
	 *
	 * @return array List of array( $head, $tail ) pairs
	 */
	public static function provideAllBytes() {
		return array(
			array( '', '' ),
			array( 'x', '' ),
			array( '', 'x' ),
			array( 'x', 'x' ),
		);
	}

	/**
	 * Check every single byte value 0x00-0xff, wrapped in $head and $tail.
	 *
	 * Tab, LF, CR and 0x20-0x7f must survive; every other byte must be
	 * replaced by UTF8_REPLACEMENT. A failed assertion aborts the test, so no
	 * explicit early exit is needed after the first mismatch.
	 *
	 * @dataProvider provideAllBytes
	 * @param string $head Bytes placed before the byte under test
	 * @param string $tail Bytes placed after the byte under test
	 */
	public function testBytes( $head, $tail ) {
		for ( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );

			if ( $i === 0x0009 ||
				$i === 0x000a ||
				$i === 0x000d ||
				( $i > 0x001f && $i < 0x80 )
			) {
				$this->assertSameBytes( $char, $clean, "ASCII byte $x should be intact" );
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertSameBytes( $norm, $clean, "Forbidden byte $x should be rejected" );
			}
		}
	}

	/**
	 * Check two-byte combinations (every second value of each byte), wrapped
	 * in $head and $tail.
	 *
	 * @dataProvider provideAllBytes
	 * @param string $head Bytes placed before the pair under test
	 * @param string $tail Bytes placed after the pair under test
	 */
	public function testDoubleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );

				if ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
					# Head byte c2-df followed by a tail byte 80-bf
					$norm = UtfNormal::NFC( $char );
					$message = "Pair $x should be intact";
				} elseif ( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes -- expect two replacement chars
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$message = "Forbidden pair $x should be rejected";
				} else {
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$message = "Forbidden pair $x should be rejected";
				}

				$this->assertSameBytes( $norm, $clean, $message );
			}
		}
	}

	/**
	 * Check three-byte combinations, wrapped in $head and $tail.
	 *
	 * The first two bytes step through every second value; the third byte is
	 * fixed at 0x80.
	 *
	 * @dataProvider provideAllBytes
	 * @param string $head Bytes placed before the triplet under test
	 * @param string $tail Bytes placed after the triplet under test
	 */
	public function testTripleBytes( $head, $tail ) {
		$rep = UTF8_REPLACEMENT;

		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				# Only 0x80 is exercised as third byte; a full sweep would be
				# for ( $third = 0x80; $third < 0x100; $third++ )
				for ( $third = 0x80; $third < 0x81; $third++ ) {
					$triplet = chr( $first ) . chr( $second ) . chr( $third );
					$char = $head . $triplet . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

					if ( $first >= 0xe0 &&
						$first < 0xf0 &&
						$second < 0xc0 &&
						$third < 0xc0
					) {
						# Structurally a three-byte sequence: head e0-ef plus two tail bytes
						if ( $first == 0xe0 && $second < 0xa0 ) {
							$expect = $head . $rep . $tail;
							$message = "Overlong triplet $x should be rejected";
						} elseif ( $first == 0xed && $triplet >= UTF8_SURROGATE_FIRST ) {
							$expect = $head . $rep . $tail;
							$message = "Surrogate triplet $x should be rejected";
						} else {
							$expect = UtfNormal::NFC( $char );
							$message = "Triplet $x should be intact";
						}
					} elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						# First two bytes form a two-byte sequence; the third is a stray tail
						$expect = UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) .
							$rep . $tail;
						$message = "Valid 2-byte $x + broken tail";
					} elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						# First byte is a broken head; the last two form a two-byte sequence
						$expect = $head . $rep .
							UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail );
						$message = "Broken head + valid 2-byte $x";
					} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
						( ( $second > 0xbf && $third > 0xbf ) ||
							( $second < 0xc0 && $third < 0xc0 ) ||
							( $second > 0xfd ) ||
							( $third > 0xfd ) )
					) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$expect = $head . $rep . $rep . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					} elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
						$expect = $head . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					} else {
						$expect = $head . $rep . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					}

					$this->assertSameBytes( $expect, $clean, $message );
				}
			}
		}
	}

	/**
	 * Regression case: mix of valid bytes, one valid two-byte sequence and
	 * several invalid bytes.
	 */
	public function testChunkRegression() {
		# Check for regression against a chunking bug
		$text = "\x46\x55\xb8" .
			"\xdc\x96" .
			"\xee" .
			"\xe7" .
			"\x44" .
			"\xaa" .
			"\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
			"\xdc\x96" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x44" .
			"\xef\xbf\xbd" .
			"\x2f\x25";

		$this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Regression case: invalid bytes interposed between valid ASCII, ending
	 * with a valid two-byte sequence.
	 */
	public function testInterposeRegression() {
		$text = "\x4e\x30" .
			"\xb1" . # bad tail
			"\x3a" .
			"\x92" . # bad tail
			"\x62\x3a" .
			"\x84" . # bad tail
			"\x43" .
			"\xc6" . # bad head
			"\x3f" .
			"\x92" . # bad tail
			"\xad" . # bad tail
			"\x7d" .
			"\xd9\x95";

		$expect = "\x4e\x30" .
			"\xef\xbf\xbd" .
			"\x3a" .
			"\xef\xbf\xbd" .
			"\x62\x3a" .
			"\xef\xbf\xbd" .
			"\x43" .
			"\xef\xbf\xbd" .
			"\x3f" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x7d" .
			"\xd9\x95";

		$this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Regression case: an overlong sequence surrounded by forbidden ASCII
	 * and stray head/tail bytes.
	 */
	public function testOverlongRegression() {
		$text = "\x67" .
			"\x1a" . # forbidden ascii
			"\xea" . # bad head
			"\xc1\xa6" . # overlong sequence
			"\xad" . # bad tail
			"\x1c" . # forbidden ascii
			"\xb0" . # bad tail
			"\x3c" .
			"\x9e"; # bad tail
		$expect = "\x67" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x3c" .
			"\xef\xbf\xbd";

		$this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Regression case: an encoded surrogate followed by stray tail bytes.
	 */
	public function testSurrogateRegression() {
		$text = "\xed\xb4\x96" . # surrogate 0xDD16
			"\x83" . # bad tail
			"\xb4" . # bad tail
			"\xac"; # bad tail
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd";

		$this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Regression case: U+FFFE followed by a stray tail byte and a head byte
	 * that is cut short by ASCII.
	 */
	public function testBomRegression() {
		$text = "\xef\xbf\xbe" . # U+FFFE, illegal char
			"\xb2" . # bad tail
			"\xef" . # bad head
			"\x59";
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x59";

		$this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Regression case: U+FFFF must be replaced.
	 */
	public function testForbiddenRegression() {
		$text = "\xef\xbf\xbf"; # U+FFFF, illegal char
		$expect = "\xef\xbf\xbd";

		$this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Regression case: a Hangul character followed by a further final jamo
	 * must be left untouched.
	 */
	public function testHangulRegression() {
		$text = "\xed\x9c\xaf" . # Hangul char
			"\xe1\x87\x81"; # followed by another final jamo
		$expect = $text; # Should *not* change.

		$this->assertSameBytes( $expect, UtfNormal::cleanUp( $text ) );
	}
}