MediaWiki
REL1_23
|
<?php

/**
 * Tests for UtfNormal::cleanUp().
 *
 * Each test feeds cleanUp() a byte string and checks the returned bytes.
 * Where a sequence is rejected, the expected output contains the bytes
 * EF BF BD (U+FFFD) or the UTF8_REPLACEMENT constant in its place.
 * NOTE(review): UTF8_REPLACEMENT is defined outside this file; it is
 * presumably that same U+FFFD sequence -- confirm.
 */
class CleanUpTest extends MediaWikiTestCase {

	/**
	 * Assert that two byte strings are identical, comparing and reporting
	 * them hex-encoded.
	 *
	 * The comparison is strict (assertSame) so that two different hex strings
	 * can never be treated as equal through numeric-string juggling.
	 *
	 * @param string $expected Expected raw bytes
	 * @param string $actual Raw bytes actually produced
	 * @param string $message Optional failure message
	 */
	private function assertSameHex( $expected, $actual, $message = '' ) {
		$this->assertSame( bin2hex( $expected ), bin2hex( $actual ), $message );
	}

	/** Plain ASCII text is returned unchanged. */
	public function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/** A NUL byte is replaced by U+FFFD. */
	public function testNull() {
		$text = "a \x00 null";
		$expect = "a \xef\xbf\xbd null";
		$this->assertSameHex( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Text containing a precomposed U+00E9 is returned unchanged. */
	public function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/** "e" followed by combining acute (U+0301) comes back as precomposed U+00E9. */
	public function testLatinNormal() {
		$text = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertSame( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Sweep every code point from 0 through UNICODE_MAX.
	 *
	 * The name does not start with "test", so PHPUnit does not pick it up.
	 * NOTE(review): presumably disabled because the full sweep is slow -- confirm.
	 */
	public function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		// Inclusive upper bound: the acceptance check below explicitly covers
		// $i <= UNICODE_MAX, so the last code point must be visited too.
		for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );

			// Progress marker every 0x1000 code points.
			if ( $i % 0x1000 === 0 ) {
				echo "U+$x\n";
			}

			if ( $i == 0x0009 ||
				$i == 0x000a ||
				$i == 0x000d ||
				( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
				( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
				( $i > 0xffff && $i <= UNICODE_MAX )
			) {
				if ( isset( UtfNormal::$utfCanonicalComp[$char] )
					|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
				) {
					// Characters listed in the canonical tables must match NFC().
					$comp = UtfNormal::NFC( $char );
					$this->assertSameHex( $comp, $clean, "U+$x should be normalized to NFC" );
				} else {
					$this->assertSameHex( $char, $clean, "U+$x should be intact" );
				}
			} else {
				$this->assertSameHex( $rep, $clean, $x );
			}
		}
	}

	/** Run the single-byte sweep bare and with an 'x' before and/or after. */
	public function testAllBytes() {
		$this->doTestBytes( '', '' );
		$this->doTestBytes( 'x', '' );
		$this->doTestBytes( '', 'x' );
		$this->doTestBytes( 'x', 'x' );
	}

	/**
	 * Check every byte value 0x00-0xFF, wrapped in $head and $tail.
	 *
	 * Tab, LF, CR and 0x20-0x7F must come back unchanged; every other byte
	 * must be replaced by UTF8_REPLACEMENT.
	 *
	 * @param string $head Bytes placed before the byte under test
	 * @param string $tail Bytes placed after the byte under test
	 */
	public function doTestBytes( $head, $tail ) {
		for ( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );

			if ( $i == 0x0009 ||
				$i == 0x000a ||
				$i == 0x000d ||
				( $i > 0x001f && $i < 0x80 )
			) {
				$this->assertSameHex( $char, $clean, "ASCII byte $x should be intact" );
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertSameHex( $norm, $clean, "Forbidden byte $x should be rejected" );
			}
		}
	}

	/** Run the two-byte sweep bare and with an 'x' before and/or after. */
	public function testDoubleBytes() {
		$this->doTestDoubleBytes( '', '' );
		$this->doTestDoubleBytes( 'x', '' );
		$this->doTestDoubleBytes( '', 'x' );
		$this->doTestDoubleBytes( 'x', 'x' );
	}

	/**
	 * Check two-byte combinations, wrapped in $head and $tail.
	 *
	 * Both loops step by 2, so only even values of each byte are sampled.
	 *
	 * @param string $head Bytes placed before the pair under test
	 * @param string $tail Bytes placed after the pair under test
	 */
	public function doTestDoubleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );

				if ( $first > 0xc1 &&
					$first < 0xe0 &&
					$second < 0xc0
				) {
					// Lead byte C2-DF with a continuation byte: compare against NFC().
					$norm = UtfNormal::NFC( $char );
					$this->assertSameHex( $norm, $clean, "Pair $x should be intact" );
				} elseif ( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes -- expect two replacement chars
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$this->assertSameHex( $norm, $clean, "Forbidden pair $x should be rejected" );
				} else {
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$this->assertSameHex( $norm, $clean, "Forbidden pair $x should be rejected" );
				}
			}
		}
	}

	/** Run the three-byte sweep bare and with an 'x' before and/or after. */
	public function testTripleBytes() {
		$this->doTestTripleBytes( '', '' );
		$this->doTestTripleBytes( 'x', '' );
		$this->doTestTripleBytes( '', 'x' );
		$this->doTestTripleBytes( 'x', 'x' );
	}

	/**
	 * Check three-byte combinations, wrapped in $head and $tail.
	 *
	 * The first two bytes step by 2 (even values only); the third byte is
	 * pinned to 0x80.
	 *
	 * @param string $head Bytes placed before the triplet under test
	 * @param string $tail Bytes placed after the triplet under test
	 */
	public function doTestTripleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				# Full sweep of the third byte would be:
				#   for ( $third = 0x80; $third < 0x100; $third++ )
				# NOTE(review): presumably narrowed to keep the run time down -- confirm.
				for ( $third = 0x80; $third < 0x81; $third++ ) {
					$char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

					if ( $first >= 0xe0 &&
						$first < 0xf0 &&
						$second < 0xc0 &&
						$third < 0xc0
					) {
						if ( $first == 0xe0 && $second < 0xa0 ) {
							$this->assertSameHex(
								$head . UTF8_REPLACEMENT . $tail,
								$clean,
								"Overlong triplet $x should be rejected" );
						} elseif ( $first == 0xed &&
							( chr( $first ) . chr( $second ) . chr( $third ) ) >= UTF8_SURROGATE_FIRST
						) {
							$this->assertSameHex(
								$head . UTF8_REPLACEMENT . $tail,
								$clean,
								"Surrogate triplet $x should be rejected" );
						} else {
							$this->assertSameHex(
								UtfNormal::NFC( $char ),
								$clean,
								"Triplet $x should be intact" );
						}
					} elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						$this->assertSameHex(
							UtfNormal::NFC( $head . chr( $first ) . chr( $second ) )
								. UTF8_REPLACEMENT . $tail,
							$clean,
							"Valid 2-byte $x + broken tail" );
					} elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						$this->assertSameHex(
							$head . UTF8_REPLACEMENT
								. UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ),
							$clean,
							"Broken head + valid 2-byte $x" );
					} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
						( ( $second > 0xbf && $third > 0xbf ) ||
							( $second < 0xc0 && $third < 0xc0 ) ||
							( $second > 0xfd ) ||
							( $third > 0xfd ) )
					) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$this->assertSameHex(
							$head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail,
							$clean,
							"Forbidden triplet $x should be rejected" );
					} elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
						$this->assertSameHex(
							$head . UTF8_REPLACEMENT . $tail,
							$clean,
							"Forbidden triplet $x should be rejected" );
					} else {
						$this->assertSameHex(
							$head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail,
							$clean,
							"Forbidden triplet $x should be rejected" );
					}
				}
			}
		}
	}

	/** Fixed input guarding against a regression in chunk handling. */
	public function testChunkRegression() {
		# Check for regression against a chunking bug
		$text = "\x46\x55\xb8" .
			"\xdc\x96" .
			"\xee" .
			"\xe7" .
			"\x44" .
			"\xaa" .
			"\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
			"\xdc\x96" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x44" .
			"\xef\xbf\xbd" .
			"\x2f\x25";

		$this->assertSameHex( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Fixed input with stray head and tail bytes interleaved with ASCII. */
	public function testInterposeRegression() {
		$text = "\x4e\x30" .
			"\xb1" . # bad tail
			"\x3a" .
			"\x92" . # bad tail
			"\x62\x3a" .
			"\x84" . # bad tail
			"\x43" .
			"\xc6" . # bad head
			"\x3f" .
			"\x92" . # bad tail
			"\xad" . # bad tail
			"\x7d" .
			"\xd9\x95";

		$expect = "\x4e\x30" .
			"\xef\xbf\xbd" .
			"\x3a" .
			"\xef\xbf\xbd" .
			"\x62\x3a" .
			"\xef\xbf\xbd" .
			"\x43" .
			"\xef\xbf\xbd" .
			"\x3f" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x7d" .
			"\xd9\x95";

		$this->assertSameHex( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Fixed input mixing an overlong sequence with forbidden and stray bytes. */
	public function testOverlongRegression() {
		$text = "\x67" .
			"\x1a" . # forbidden ascii
			"\xea" . # bad head
			"\xc1\xa6" . # overlong sequence
			"\xad" . # bad tail
			"\x1c" . # forbidden ascii
			"\xb0" . # bad tail
			"\x3c" .
			"\x9e"; # bad tail
		$expect = "\x67" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x3c" .
			"\xef\xbf\xbd";

		$this->assertSameHex( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Fixed input starting with an encoded surrogate, followed by stray bytes. */
	public function testSurrogateRegression() {
		$text = "\xed\xb4\x96" . # surrogate 0xDD16
			"\x83" . # bad tail
			"\xb4" . # bad tail
			"\xac"; # bad head
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd";

		$this->assertSameHex( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Fixed input starting with U+FFFE, followed by stray bytes. */
	public function testBomRegression() {
		$text = "\xef\xbf\xbe" . # U+FFFE, illegal char
			"\xb2" . # bad tail
			"\xef" . # bad head
			"\x59";
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x59";

		$this->assertSameHex( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** U+FFFF is replaced by U+FFFD. */
	public function testForbiddenRegression() {
		$text = "\xef\xbf\xbf"; # U+FFFF, illegal char
		$expect = "\xef\xbf\xbd";

		$this->assertSameHex( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** A Hangul syllable followed by a further final jamo is left unchanged. */
	public function testHangulRegression() {
		$text = "\xed\x9c\xaf" . # Hangul char
			"\xe1\x87\x81"; # followed by another final jamo
		$expect = $text; # Should *not* change.

		$this->assertSameHex( $expect, UtfNormal::cleanUp( $text ) );
	}
}