MediaWiki
REL1_19
|
<?php
/**
 * Tests for UtfNormal::cleanUp(): plain pass-through cases, exhaustive
 * sweeps over one-, two- and three-byte inputs, and fixed regression
 * inputs whose expected output is spelled out byte by byte.
 *
 * Byte-level expectations are compared as hex dumps with assertSame().
 * A strict comparison is used on purpose: hex dumps can look like numeric
 * strings (e.g. "0e12"), which a loose comparison could treat as equal.
 */
class CleanUpTest extends MediaWikiTestCase {
	/**
	 * Assert that two byte strings are identical, reporting them as hex
	 * dumps so that non-printable bytes are readable in failure output.
	 *
	 * @param $expected String: expected bytes
	 * @param $actual String: bytes actually produced
	 * @param $message String: optional failure message
	 */
	private function assertBytesIdentical( $expected, $actual, $message = '' ) {
		$this->assertSame( bin2hex( $expected ), bin2hex( $actual ), $message );
	}

	/** Plain ASCII text must come back from cleanUp() unchanged. */
	public function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/** A NUL byte must be replaced by the bytes EF BF BD. */
	public function testNull() {
		$text = "a \x00 null";
		$expect = "a \xef\xbf\xbd null";
		$this->assertBytesIdentical( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Text containing the two-byte sequence C3 A9 must be left unchanged. */
	public function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertSame( $text, UtfNormal::cleanUp( $text ) );
	}

	/** "e" followed by CC 81 must come back as the single sequence C3 A9. */
	public function testLatinNormal() {
		$text = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertSame( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Sweep over code points and check each one individually.
	 *
	 * The "X" prefix keeps the method name from starting with "test".
	 * Progress is echoed every 0x1000 code points.
	 *
	 * NOTE(review): the loop stops at UNICODE_MAX - 1 although the
	 * condition below tests "<= UNICODE_MAX", so the last code point is
	 * never exercised. Confirm the intended bound before enabling this.
	 */
	public function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		for ( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );
			if ( $i % 0x1000 === 0 ) {
				echo "U+$x\n";
			}
			# Tab, LF, CR and everything above 0x1f are expected to survive,
			# except the surrogate range and the code points FFFE / FFFF.
			if ( $i === 0x0009 ||
				$i === 0x000a ||
				$i === 0x000d ||
				( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
				( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
				( $i > 0xffff && $i <= UNICODE_MAX )
			) {
				if ( isset( UtfNormal::$utfCanonicalComp[$char] )
					|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
				) {
					# Listed in a normalization table: expect the NFC form.
					$comp = UtfNormal::NFC( $char );
					$this->assertBytesIdentical(
						$comp,
						$clean,
						"U+$x should be decomposed" );
				} else {
					$this->assertBytesIdentical(
						$char,
						$clean,
						"U+$x should be intact" );
				}
			} else {
				$this->assertBytesIdentical( $rep, $clean, $x );
			}
		}
	}

	/** Run the single-byte sweep with and without surrounding "x" context. */
	public function testAllBytes() {
		$this->doTestBytes( '', '' );
		$this->doTestBytes( 'x', '' );
		$this->doTestBytes( '', 'x' );
		$this->doTestBytes( 'x', 'x' );
	}

	/**
	 * Feed every byte value 0x00..0xff, wrapped in $head and $tail, through
	 * cleanUp(). Tab, LF, CR and 0x20..0x7f must be left intact; any other
	 * byte must become a single UTF8_REPLACEMENT.
	 *
	 * A failed assertion throws, which ends the sweep at the first mismatch.
	 *
	 * @param $head String: text placed before the byte under test
	 * @param $tail String: text placed after the byte under test
	 */
	public function doTestBytes( $head, $tail ) {
		for ( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );
			if ( $i === 0x0009 ||
				$i === 0x000a ||
				$i === 0x000d ||
				( $i > 0x001f && $i < 0x80 )
			) {
				$this->assertBytesIdentical(
					$char,
					$clean,
					"ASCII byte $x should be intact" );
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertBytesIdentical(
					$norm,
					$clean,
					"Forbidden byte $x should be rejected" );
			}
		}
	}

	/** Run the two-byte sweep with and without surrounding "x" context. */
	public function testDoubleBytes() {
		$this->doTestDoubleBytes( '', '' );
		$this->doTestDoubleBytes( 'x', '' );
		$this->doTestDoubleBytes( '', 'x' );
		$this->doTestDoubleBytes( 'x', 'x' );
	}

	/**
	 * Sweep over byte pairs, wrapped in $head and $tail. The first byte
	 * runs over 0xc0..0xfe and the second over 0x80..0xfe, both in steps
	 * of two, so only even byte values are exercised.
	 *
	 * A failed assertion throws, which ends the sweep at the first mismatch.
	 *
	 * @param $head String: text placed before the pair under test
	 * @param $tail String: text placed after the pair under test
	 */
	public function doTestDoubleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );
				if ( $first > 0xc1 &&
					$first < 0xe0 &&
					$second < 0xc0
				) {
					# Lead byte c2..df with a tail byte below c0:
					# expected to match the NFC form of the input.
					$norm = UtfNormal::NFC( $char );
					$this->assertBytesIdentical(
						$norm,
						$clean,
						"Pair $x should be intact" );
				} elseif ( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes -- expect two replacement chars
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$this->assertBytesIdentical(
						$norm,
						$clean,
						"Forbidden pair $x should be rejected" );
				} else {
					# Remaining pairs are expected to collapse into one replacement.
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$this->assertBytesIdentical(
						$norm,
						$clean,
						"Forbidden pair $x should be rejected" );
				}
			}
		}
	}

	/** Run the three-byte sweep with and without surrounding "x" context. */
	public function testTripleBytes() {
		$this->doTestTripleBytes( '', '' );
		$this->doTestTripleBytes( 'x', '' );
		$this->doTestTripleBytes( '', 'x' );
		$this->doTestTripleBytes( 'x', 'x' );
	}

	/**
	 * Sweep over byte triplets, wrapped in $head and $tail. The first and
	 * second bytes step by two (0xc0..0xfe and 0x80..0xfe respectively).
	 * The branch order below matters: each elseif only sees the triplets
	 * that every earlier condition rejected.
	 *
	 * @param $head String: text placed before the triplet under test
	 * @param $tail String: text placed after the triplet under test
	 */
	public function doTestTripleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				# Only 0x80 is exercised as the third byte; the full sweep
				# is kept here, disabled:
				#for( $third = 0x80; $third < 0x100; $third++ ) {
				for ( $third = 0x80; $third < 0x81; $third++ ) {
					$char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
					if ( $first >= 0xe0 &&
						$first < 0xf0 &&
						$second < 0xc0 &&
						$third < 0xc0
					) {
						# Lead byte e0..ef followed by two tail bytes.
						if ( $first === 0xe0 && $second < 0xa0 ) {
							$this->assertBytesIdentical(
								$head . UTF8_REPLACEMENT . $tail,
								$clean,
								"Overlong triplet $x should be rejected" );
						} elseif ( $first === 0xed &&
							( chr( $first ) . chr( $second ) . chr( $third ) ) >= UTF8_SURROGATE_FIRST
						) {
							$this->assertBytesIdentical(
								$head . UTF8_REPLACEMENT . $tail,
								$clean,
								"Surrogate triplet $x should be rejected" );
						} else {
							$this->assertBytesIdentical(
								UtfNormal::NFC( $char ),
								$clean,
								"Triplet $x should be intact" );
						}
					} elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						# The first two bytes are checked as a pair on their own;
						# the third byte is expected to become a replacement.
						$this->assertBytesIdentical(
							UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail,
							$clean,
							"Valid 2-byte $x + broken tail" );
					} elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						# The first byte is expected to become a replacement;
						# the last two bytes are checked as a pair.
						$this->assertBytesIdentical(
							$head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ),
							$clean,
							"Broken head + valid 2-byte $x" );
					} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
						( ( $second > 0xbf && $third > 0xbf ) ||
							( $second < 0xc0 && $third < 0xc0 ) ||
							( $second > 0xfd ) ||
							( $third > 0xfd ) )
					) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$this->assertBytesIdentical(
							$head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail,
							$clean,
							"Forbidden triplet $x should be rejected" );
					} elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
						$this->assertBytesIdentical(
							$head . UTF8_REPLACEMENT . $tail,
							$clean,
							"Forbidden triplet $x should be rejected" );
					} else {
						$this->assertBytesIdentical(
							$head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail,
							$clean,
							"Forbidden triplet $x should be rejected" );
					}
				}
			}
		}
	}

	/** Fixed input/output pair guarding against a chunking bug. */
	public function testChunkRegression() {
		# Check for regression against a chunking bug
		$text = "\x46\x55\xb8" .
			"\xdc\x96" .
			"\xee" .
			"\xe7" .
			"\x44" .
			"\xaa" .
			"\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
			"\xdc\x96" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x44" .
			"\xef\xbf\xbd" .
			"\x2f\x25";

		$this->assertBytesIdentical( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Bad head and tail bytes interleaved with ASCII; each becomes EF BF BD. */
	public function testInterposeRegression() {
		$text = "\x4e\x30" .
			"\xb1" . # bad tail
			"\x3a" .
			"\x92" . # bad tail
			"\x62\x3a" .
			"\x84" . # bad tail
			"\x43" .
			"\xc6" . # bad head
			"\x3f" .
			"\x92" . # bad tail
			"\xad" . # bad tail
			"\x7d" .
			"\xd9\x95";

		$expect = "\x4e\x30" .
			"\xef\xbf\xbd" .
			"\x3a" .
			"\xef\xbf\xbd" .
			"\x62\x3a" .
			"\xef\xbf\xbd" .
			"\x43" .
			"\xef\xbf\xbd" .
			"\x3f" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x7d" .
			"\xd9\x95";

		$this->assertBytesIdentical( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** Overlong sequence mixed with forbidden ASCII and stray bytes. */
	public function testOverlongRegression() {
		$text = "\x67" .
			"\x1a" . # forbidden ascii
			"\xea" . # bad head
			"\xc1\xa6" . # overlong sequence
			"\xad" . # bad tail
			"\x1c" . # forbidden ascii
			"\xb0" . # bad tail
			"\x3c" .
			"\x9e"; # bad tail
		$expect = "\x67" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x3c" .
			"\xef\xbf\xbd";
		$this->assertBytesIdentical( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** An encoded surrogate followed by stray bytes: four replacements. */
	public function testSurrogateRegression() {
		$text = "\xed\xb4\x96" . # surrogate 0xDD16
			"\x83" . # bad tail
			"\xb4" . # bad tail
			"\xac"; # bad head
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd";
		$this->assertBytesIdentical( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** U+FFFE followed by stray bytes; the trailing "Y" must survive. */
	public function testBomRegression() {
		$text = "\xef\xbf\xbe" . # U+FFFE, illegal char
			"\xb2" . # bad tail
			"\xef" . # bad head
			"\x59";
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x59";
		$this->assertBytesIdentical( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** U+FFFF on its own must become a single replacement. */
	public function testForbiddenRegression() {
		$text = "\xef\xbf\xbf"; # U+FFFF, illegal char
		$expect = "\xef\xbf\xbd";
		$this->assertBytesIdentical( $expect, UtfNormal::cleanUp( $text ) );
	}

	/** A Hangul character followed by a final jamo must be left as is. */
	public function testHangulRegression() {
		$text = "\xed\x9c\xaf" . # Hangul char
			"\xe1\x87\x81"; # followed by another final jamo
		$expect = $text; # Should *not* change.
		$this->assertBytesIdentical( $expect, UtfNormal::cleanUp( $text ) );
	}
}