MediaWiki
REL1_19
|
<?php
/**
 * HTML/XML sanitising helpers: character-reference handling, tag and
 * attribute whitelisting, and escaping utilities.
 */
class Sanitizer {
	/**
	 * Regular expression matching the different kinds of character
	 * reference. It uses the /x modifier, so the line breaks and
	 * indentation inside the literal are not part of the pattern.
	 *
	 * Capture groups:
	 *  1. named reference (letters, digits or high-bit bytes)
	 *  2. decimal numeric reference
	 *  3. hexadecimal numeric reference
	 *  4. a bare ampersand that starts none of the above
	 */
	const CHAR_REFS_REGEX =
		'/&([A-Za-z0-9\x80-\xff]+);
		 |&\#([0-9]+);
		 |&\#[xX]([0-9A-Fa-f]+);
		 |(&)/x';

	/**
	 * Matches "javascript" or "vbscript" (case-insensitively) when it
	 * appears at the start of the value, after whitespace, or after the
	 * end of a CSS-style comment, and is followed by a non-word character
	 * or the end of the value.
	 */
	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

	/**
	 * Matches an XML namespace declaration attribute name ("xmlns:prefix").
	 */
	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

	/**
	 * Map of named character entities to their Unicode code point.
	 * Keys are case-sensitive entity names without '&' and ';'.
	 */
	static $htmlEntities = array(
		'Aacute'   => 193,
		'aacute'   => 225,
		'Acirc'    => 194,
		'acirc'    => 226,
		'acute'    => 180,
		'AElig'    => 198,
		'aelig'    => 230,
		'Agrave'   => 192,
		'agrave'   => 224,
		'alefsym'  => 8501,
		'Alpha'    => 913,
		'alpha'    => 945,
		'amp'      => 38,
		'and'      => 8743,
		'ang'      => 8736,
		'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
		'Aring'    => 197,
		'aring'    => 229,
		'asymp'    => 8776,
		'Atilde'   => 195,
		'atilde'   => 227,
		'Auml'     => 196,
		'auml'     => 228,
		'bdquo'    => 8222,
		'Beta'     => 914,
		'beta'     => 946,
		'brvbar'   => 166,
		'bull'     => 8226,
		'cap'      => 8745,
		'Ccedil'   => 199,
		'ccedil'   => 231,
		'cedil'    => 184,
		'cent'     => 162,
		'Chi'      => 935,
		'chi'      => 967,
		'circ'     => 710,
		'clubs'    => 9827,
		'cong'     => 8773,
		'copy'     => 169,
		'crarr'    => 8629,
		'cup'      => 8746,
		'curren'   => 164,
		'dagger'   => 8224,
		'Dagger'   => 8225,
		'darr'     => 8595,
		'dArr'     => 8659,
		'deg'      => 176,
		'Delta'    => 916,
		'delta'    => 948,
		'diams'    => 9830,
		'divide'   => 247,
		'Eacute'   => 201,
		'eacute'   => 233,
		'Ecirc'    => 202,
		'ecirc'    => 234,
		'Egrave'   => 200,
		'egrave'   => 232,
		'empty'    => 8709,
		'emsp'     => 8195,
		'ensp'     => 8194,
		'Epsilon'  => 917,
		'epsilon'  => 949,
		'equiv'    => 8801,
		'Eta'      => 919,
		'eta'      => 951,
		'ETH'      => 208,
		'eth'      => 240,
		'Euml'     => 203,
		'euml'     => 235,
		'euro'     => 8364,
		'exist'    => 8707,
		'fnof'     => 402,
		'forall'   => 8704,
		'frac12'   => 189,
		'frac14'   => 188,
		'frac34'   => 190,
		'frasl'    => 8260,
		'Gamma'    => 915,
		'gamma'    => 947,
		'ge'       => 8805,
		'gt'       => 62,
		'harr'     => 8596,
		'hArr'     => 8660,
		'hearts'   => 9829,
		'hellip'   => 8230,
		'Iacute'   => 205,
		'iacute'   => 237,
		'Icirc'    => 206,
		'icirc'    => 238,
		'iexcl'    => 161,
		'Igrave'   => 204,
		'igrave'   => 236,
		'image'    => 8465,
		'infin'    => 8734,
		'int'      => 8747,
		'Iota'     => 921,
		'iota'     => 953,
		'iquest'   => 191,
		'isin'     => 8712,
		'Iuml'     => 207,
		'iuml'     => 239,
		'Kappa'    => 922,
		'kappa'    => 954,
		'Lambda'   => 923,
		'lambda'   => 955,
		'lang'     => 9001,
		'laquo'    => 171,
		'larr'     => 8592,
		'lArr'     => 8656,
		'lceil'    => 8968,
		'ldquo'    => 8220,
		'le'       => 8804,
		'lfloor'   => 8970,
		'lowast'   => 8727,
		'loz'      => 9674,
		'lrm'      => 8206,
		'lsaquo'   => 8249,
		'lsquo'    => 8216,
		'lt'       => 60,
		'macr'     => 175,
		'mdash'    => 8212,
		'micro'    => 181,
		'middot'   => 183,
		'minus'    => 8722,
		'Mu'       => 924,
		'mu'       => 956,
		'nabla'    => 8711,
		'nbsp'     => 160,
		'ndash'    => 8211,
		'ne'       => 8800,
		'ni'       => 8715,
		'not'      => 172,
		'notin'    => 8713,
		'nsub'     => 8836,
		'Ntilde'   => 209,
		'ntilde'   => 241,
		'Nu'       => 925,
		'nu'       => 957,
		'Oacute'   => 211,
		'oacute'   => 243,
		'Ocirc'    => 212,
		'ocirc'    => 244,
		'OElig'    => 338,
		'oelig'    => 339,
		'Ograve'   => 210,
		'ograve'   => 242,
		'oline'    => 8254,
		'Omega'    => 937,
		'omega'    => 969,
		'Omicron'  => 927,
		'omicron'  => 959,
		'oplus'    => 8853,
		'or'       => 8744,
		'ordf'     => 170,
		'ordm'     => 186,
		'Oslash'   => 216,
		'oslash'   => 248,
		'Otilde'   => 213,
		'otilde'   => 245,
		'otimes'   => 8855,
		'Ouml'     => 214,
		'ouml'     => 246,
		'para'     => 182,
		'part'     => 8706,
		'permil'   => 8240,
		'perp'     => 8869,
		'Phi'      => 934,
		'phi'      => 966,
		'Pi'       => 928,
		'pi'       => 960,
		'piv'      => 982,
		'plusmn'   => 177,
		'pound'    => 163,
		'prime'    => 8242,
		'Prime'    => 8243,
		'prod'     => 8719,
		'prop'     => 8733,
		'Psi'      => 936,
		'psi'      => 968,
		'quot'     => 34,
		'radic'    => 8730,
		'rang'     => 9002,
		'raquo'    => 187,
		'rarr'     => 8594,
		'rArr'     => 8658,
		'rceil'    => 8969,
		'rdquo'    => 8221,
		'real'     => 8476,
		'reg'      => 174,
		'rfloor'   => 8971,
		'Rho'      => 929,
		'rho'      => 961,
		'rlm'      => 8207,
		'rsaquo'   => 8250,
		'rsquo'    => 8217,
		'sbquo'    => 8218,
		'Scaron'   => 352,
		'scaron'   => 353,
		'sdot'     => 8901,
		'sect'     => 167,
		'shy'      => 173,
		'Sigma'    => 931,
		'sigma'    => 963,
		'sigmaf'   => 962,
		'sim'      => 8764,
		'spades'   => 9824,
		'sub'      => 8834,
		'sube'     => 8838,
		'sum'      => 8721,
		'sup'      => 8835,
		'sup1'     => 185,
		'sup2'     => 178,
		'sup3'     => 179,
		'supe'     => 8839,
		'szlig'    => 223,
		'Tau'      => 932,
		'tau'      => 964,
		'there4'   => 8756,
		'Theta'    => 920,
		'theta'    => 952,
		'thetasym' => 977,
		'thinsp'   => 8201,
		'THORN'    => 222,
		'thorn'    => 254,
		'tilde'    => 732,
		'times'    => 215,
		'trade'    => 8482,
		'Uacute'   => 218,
		'uacute'   => 250,
		'uarr'     => 8593,
		'uArr'     => 8657,
		'Ucirc'    => 219,
		'ucirc'    => 251,
		'Ugrave'   => 217,
		'ugrave'   => 249,
		'uml'      => 168,
		'upsih'    => 978,
		'Upsilon'  => 933,
		'upsilon'  => 965,
		'Uuml'     => 220,
		'uuml'     => 252,
		'weierp'   => 8472,
		'Xi'       => 926,
		'xi'       => 958,
		'Yacute'   => 221,
		'yacute'   => 253,
		'yen'      => 165,
		'Yuml'     => 376,
		'yuml'     => 255,
		'Zeta'     => 918,
		'zeta'     => 950,
		'zwj'      => 8205,
		'zwnj'     => 8204
	);
# Sanitizer class members: entity aliases, attribute parsing, tag/attribute/CSS sanitising and character-reference helpers.

	/**
	 * Alternative (localised) names for character entities. Keys are the
	 * alias, values the canonical entity name; normalizeEntity() rewrites
	 * an alias to its canonical form.
	 */
	static $htmlEntityAliases = array(
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	);

	/**
	 * Lazily built attribute-matching regex; see getAttribsRegex().
	 */
	static $attribsRegex;

	/**
	 * Build (once) and return the regular expression used to pick
	 * name[=value] pairs out of the attribute portion of a tag.
	 *
	 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
	 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
	 * 6 = unquoted "#hex" colour value.
	 *
	 * @return string PCRE pattern (uses the /s and /x modifiers)
	 */
	static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$attribFirst = '[:A-Z_a-z0-9]';
			$attrib = '[:A-Z_a-z-.0-9]';
			$space = '[\x09\x0a\x0d\x20]';
			self::$attribsRegex =
				"/(?:^|$space)({$attribFirst}{$attrib}*)
				  ($space*=$space*
					(?:
					 # The attribute value: quoted or alone
					  \"([^<\"]*)\"
					 | '([^<']*)'
					 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
					 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
					                     # colors are specified like this.
					                     # We'll be normalizing it.
					)
				)?(?=$space|\$)/sx";
		}
		return self::$attribsRegex;
	}

	/**
	 * Clean up HTML: strip comments, keep only whitelisted elements (with
	 * sanitised attributes) and turn every other "<...>" into escaped text.
	 *
	 * When $wgUseTidy is off, the function additionally tracks open
	 * elements on a stack so that mismatched close tags are escaped and
	 * elements still open at the end are closed.
	 *
	 * @param $text String: input markup
	 * @param $processCallback Callback invoked as
	 *   callback( &$params, $args ) on the attribute text of each accepted
	 *   tag before the attributes are sanitised; ignored if not callable
	 * @param $args Array: passed through to $processCallback
	 * @param $extratags Array: additional element names to allow (they are
	 *   treated as paired elements)
	 * @param $removetags Array: element names to remove from the whitelist
	 * @return string
	 */
	static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
		global $wgUseTidy;

		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

		wfProfileIn( __METHOD__ );

		if ( !$staticInitialised ) {

			$htmlpairsStatic = array( # Tags that must be closed
				'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
				'kbd', 'samp'
			);
			$htmlsingle = array(
				'br', 'hr', 'li', 'dt', 'dd'
			);
			$htmlsingleonly = array( # Elements that cannot have close tags
				'br', 'hr'
			);
			$htmlnest = array( # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
			);
			$tabletags = array( # Can only appear inside table, we will close them
				'td', 'th', 'tr',
			);
			$htmllist = array( # Tags used by list
				'ul','ol',
			);
			$listtags = array( # Tags that can appear in a list
				'li',
			);

			global $wgAllowImageTag;
			if ( $wgAllowImageTag ) {
				$htmlsingle[] = 'img';
				$htmlsingleonly[] = 'img';
			}

			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

			# Convert them all to hashtables for faster lookup
			$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
				'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
			foreach ( $vars as $var ) {
				$$var = array_flip( $$var );
			}
			$staticInitialised = true;
		}
		# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
		$extratags = array_flip( $extratags );
		$removetags = array_flip( $removetags );
		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );

		# Remove HTML comments
		$text = Sanitizer::removeHTMLcomments( $text );
		$bits = explode( '<', $text );
		# Everything before the first '<' is plain text; escape stray '>'
		$text = str_replace( '>', '&gt;', array_shift( $bits ) );
		if ( !$wgUseTidy ) {
			$tagstack = $tablestack = array();
			foreach ( $bits as $x ) {
				$regs = array();
				# $slash: Does the current element start with a '/'?
				# $t: Current element name
				# $params: String between element name and >
				# $brace: Ending '>' or '/>'
				# $rest: Everything until the next element of $bits
				if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				} else {
					$slash = $t = $params = $brace = $rest = null;
				}

				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					# Check our stack
					if ( $slash && isset( $htmlsingleonly[$t] ) ) {
						$badtag = true;
					} elseif ( $slash ) {
						# Closing a tag... is it the one we just opened?
						$ot = @array_pop( $tagstack );
						if ( $ot != $t ) {
							if ( isset( $htmlsingleallowed[$ot] ) ) {
								# Pop all elements with an optional close tag
								# and see if we find a match below them
								$optstack = array();
								array_push( $optstack, $ot );
								wfSuppressWarnings();
								$ot = array_pop( $tagstack );
								wfRestoreWarnings();
								while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
									array_push( $optstack, $ot );
									wfSuppressWarnings();
									$ot = array_pop( $tagstack );
									wfRestoreWarnings();
								}
								if ( $t != $ot ) {
									# No match. Push the optional elements back again
									$badtag = true;
									wfSuppressWarnings();
									$ot = array_pop( $optstack );
									wfRestoreWarnings();
									while ( $ot ) {
										array_push( $tagstack, $ot );
										wfSuppressWarnings();
										$ot = array_pop( $optstack );
										wfRestoreWarnings();
									}
								}
							} else {
								@array_push( $tagstack, $ot );
								# <li> can be nested in <ul> or <ol>, skip those cases:
								if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
									$badtag = true;
								}
							}
						} else {
							if ( $t == 'table' ) {
								# Leaving a table: restore the stack saved when it was opened
								$tagstack = array_pop( $tablestack );
							}
						}
						$newparams = '';
					} else {
						# Keep track for later
						if ( isset( $tabletags[$t] ) &&
							!in_array( 'table', $tagstack ) ) {
							$badtag = true;
						} elseif ( in_array( $t, $tagstack ) &&
							!isset( $htmlnest [$t ] ) ) {
							$badtag = true;
						# Is it a self closed htmlpair ? (bug 5487)
						} elseif ( $brace == '/>' &&
							isset( $htmlpairs[$t] ) ) {
							$badtag = true;
						} elseif ( isset( $htmlsingleonly[$t] ) ) {
							# Hack to force empty tag for uncloseable elements
							$brace = '/>';
						} elseif ( isset( $htmlsingle[$t] ) ) {
							# Hack to not close $htmlsingle tags
							$brace = null;
						} elseif ( isset( $tabletags[$t] )
							&& in_array( $t, $tagstack ) ) {
							// New table tag but forgot to close the previous one
							$text .= "</$t>";
						} else {
							if ( $t == 'table' ) {
								# Entering a table: save the current stack and start a fresh one
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}

						# Replace any variables or template parameters with
						# plaintext results.
						if( is_callable( $processCallback ) ) {
							call_user_func_array( $processCallback, array( &$params, $args ) );
						}

						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
						$text .= "<$slash$t$newparams$close>$rest";
						continue;
					}
				}
				# Not an accepted tag: emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
			# Close off any remaining tags
			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
				$text .= "</$t>\n";
				if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
			}
		} else {
			# this might be possible using tidy itself
			foreach ( $bits as $x ) {
				$regs = array();
				if ( preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs ) ) {
					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				} else {
					$slash = $t = $params = $brace = $rest = null;
				}
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
				} else {
					$text .= '&lt;' . str_replace( '>', '&gt;', $x);
				}
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Remove "<!-- ... -->" comments from the text.
	 *
	 * If a comment (plus surrounding spaces) sits between two newlines,
	 * the comment, the spaces and one of the newlines are removed so no
	 * blank line is left behind. An unterminated comment stops processing
	 * and is left in place.
	 *
	 * @param $text String
	 * @return string
	 */
	static function removeHTMLcomments( $text ) {
		wfProfileIn( __METHOD__ );
		while (($start = strpos($text, '<!--')) !== false) {
			$end = strpos($text, '-->', $start + 4);
			if ($end === false) {
				# Unterminated comment; bail out
				break;
			}

			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline
			$spaceStart = max($start - 1, 0);
			$spaceLen = $end - $spaceStart;
			while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
				$spaceStart--;
				$spaceLen++;
			}
			while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
				$spaceLen++;
			if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				$text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
			}
			else {
				# Remove just the comment.
				$text = substr_replace($text, '', $start, $end - $start);
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Convert presentational attributes (align, width, nowrap, ...) into
	 * equivalent inline CSS, for the elements each attribute applies to.
	 *
	 * Does nothing unless both $wgHtml5 and
	 * $wgCleanupPresentationalAttributes are enabled. Generated rules are
	 * placed before any existing 'style' value so that the existing value
	 * can override them.
	 *
	 * @param $attribs Array: attribute name => value
	 * @param $element String: element name
	 * @return Array: the adjusted attribute array
	 */
	static function fixDeprecatedAttributes( $attribs, $element ) {
		global $wgHtml5, $wgCleanupPresentationalAttributes;

		// presentational attributes were removed from html5, we can leave them
		// in when html5 is turned off
		if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
			return $attribs;
		}

		$table = array( 'table' );
		$cells = array( 'td', 'th' );
		$colls = array( 'col', 'colgroup' );
		$tblocks = array( 'tbody', 'tfoot', 'thead' );
		$h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );

		// attribute => array( CSS property, elements the conversion applies to )
		$presentationalAttribs = array(
			'align'  => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
			'clear'  => array( 'clear', array( 'br' ) ),
			'height' => array( 'height', $cells ),
			'nowrap' => array( 'white-space', $cells ),
			'size'   => array( 'height', array( 'hr' ) ),
			'type'   => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
			'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
			'width'  => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
		);

		// Ensure that any upper case or mixed case attributes are converted to lowercase
		foreach ( $attribs as $attribute => $value ) {
			if ( $attribute !== strtolower( $attribute ) && array_key_exists( strtolower( $attribute ), $presentationalAttribs ) ) {
				$attribs[strtolower( $attribute )] = $value;
				unset( $attribs[$attribute] );
			}
		}

		$style = "";
		foreach ( $presentationalAttribs as $attribute => $info ) {
			list( $property, $elements ) = $info;

			// Skip if this attribute is not relevant to this element
			if ( !in_array( $element, $elements ) ) {
				continue;
			}

			// Skip if the attribute is not used
			if ( !array_key_exists( $attribute, $attribs ) ) {
				continue;
			}

			$value = $attribs[$attribute];

			// For nowrap the value should be nowrap instead of whatever text is in the value
			if ( $attribute === 'nowrap' ) {
				$value = 'nowrap';
			}

			// clear="all" is clear: both; in css
			if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) {
				$value = 'both';
			}

			// Size based properties should have px applied to them if they have no unit
			if ( in_array( $attribute, array( 'height', 'width', 'size' ) ) ) {
				if ( preg_match( '/^[\d.]+$/', $value ) ) {
					$value = "{$value}px";
				}
			}

			$style .= " $property: $value;";

			unset( $attribs[$attribute] );
		}

		if ( $style ) {
			// Prepend our style rules so that they can be overridden by user css
			if ( isset($attribs['style']) ) {
				$style .= " " . $attribs['style'];
			}
			$attribs['style'] = trim($style);
		}

		return $attribs;
	}

	/**
	 * Filter an attribute array against the whitelist for the given
	 * element, as returned by attributeWhitelist().
	 *
	 * @param $attribs Array: attribute name => value
	 * @param $element String: element name
	 * @return Array: filtered attributes; see validateAttributes()
	 */
	static function validateTagAttributes( $attribs, $element ) {
		return Sanitizer::validateAttributes( $attribs,
			Sanitizer::attributeWhitelist( $element ) );
	}

	/**
	 * Filter an attribute array against a list of allowed attribute names
	 * and sanitise the values that are kept.
	 *
	 * - "xmlns:*" declarations are kept when $wgAllowRdfaAttributes is on
	 *   and the value does not match EVIL_URI_PATTERN.
	 * - "data-*" attributes are kept in HTML5 mode even when not listed.
	 * - 'style' values go through checkCss(), 'id' through escapeId().
	 * - RDFa / microdata attributes whose value matches EVIL_URI_PATTERN
	 *   are dropped.
	 * - 'href' and 'src' are dropped unless they start with one of the
	 *   protocols from wfUrlProtocols().
	 *
	 * @param $attribs Array: attribute name => value
	 * @param $whitelist Array: list of allowed attribute names
	 * @return Array: the attributes that passed, name => sanitised value
	 */
	static function validateAttributes( $attribs, $whitelist ) {
		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;

		$whitelist = array_flip( $whitelist );
		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

		$out = array();
		foreach( $attribs as $attribute => $value ) {
			#allow XML namespace declaration if RDFa is enabled
			if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					$out[$attribute] = $value;
				}

				continue;
			}

			# Allow any attribute beginning with "data-", if in HTML5 mode
			if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
				continue;
			}

			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			if( $attribute == 'style' ) {
				$value = Sanitizer::checkCss( $value );
			}

			if ( $attribute === 'id' ) {
				$value = Sanitizer::escapeId( $value, 'noninitial' );
			}

			//RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
			if ( $attribute === 'rel' || $attribute === 'rev' ||
				$attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
				$attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
				$attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
				$attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata

				//Paranoia. Allow "simple" values but suppress javascript
				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					continue;
				}
			}

			# NOTE: even though elements using href/src are not allowed directly, supply
			#       validation code that can be used by tag hook handlers, etc
			if ( $attribute === 'href' || $attribute === 'src' ) {
				if ( !preg_match( $hrefExp, $value ) ) {
					continue; //drop any href or src attributes not using an allowed protocol.
					          //NOTE: this also drops all relative URLs
				}
			}

			// If this attribute was previously set, override it.
			// Output should only have one attribute of each name.
			$out[$attribute] = $value;
		}

		if ( $wgAllowMicrodataAttributes ) {
			# itemtype, itemid, itemref don't make sense without itemscope
			if ( !array_key_exists( 'itemscope', $out ) ) {
				unset( $out['itemtype'] );
				unset( $out['itemid'] );
				unset( $out['itemref'] );
			}
			# TODO: Strip itemprop if we aren't descendants of an itemscope.
		}
		return $out;
	}

	/**
	 * Merge two attribute arrays. Values in $b override those in $a,
	 * except for 'class': when both arrays carry a different string
	 * 'class', the class names of both are combined, without duplicates.
	 *
	 * @param $a Array
	 * @param $b Array
	 * @return Array
	 */
	static function mergeAttributes( $a, $b ) {
		$out = array_merge( $a, $b );
		if( isset( $a['class'] ) && isset( $b['class'] )
			&& is_string( $a['class'] ) && is_string( $b['class'] )
			&& $a['class'] !== $b['class'] ) {
			$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
				-1, PREG_SPLIT_NO_EMPTY );
			$out['class'] = implode( ' ', array_unique( $classes ) );
		}
		return $out;
	}

	/**
	 * Normalise a CSS value so that obfuscated content becomes visible to
	 * later checks: decode character references and CSS escapes, fold
	 * look-alike characters to ASCII, and strip comments.
	 *
	 * The result may contain character references; the caller is expected
	 * to escape the value before output.
	 *
	 * @param $value String
	 * @return string
	 */
	public static function normalizeCss( $value ) {

		// Decode character references like &#123;
		$value = Sanitizer::decodeCharReferences( $value );

		// Decode escape sequences and line continuation
		// See the grammar in the CSS 2 spec, appendix D.
		// This has to be done AFTER decoding character references.
		// This means it isn't possible for this function to return
		// unsanitized escape sequences. It is possible to manufacture
		// input that contains character references that decode to
		// escape sequences that decode to character references, but
		// it's OK for the return value to contain character references
		// because the caller is supposed to escape those anyway.
		static $decodeRegex;
		if ( !$decodeRegex ) {
			$space = '[\\x20\\t\\r\\n\\f]';
			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
			$backslash = '\\\\';
			$decodeRegex = "/ $backslash
				(?:
					($nl) |  # 1. Line continuation
					([0-9A-Fa-f]{1,6})$space? |  # 2. character number
					(.) |  # 3. backslash cancelling special meaning
					() |  # 4. backslash at end of string
				)/xu";
		}
		$value = preg_replace_callback( $decodeRegex,
			array( __CLASS__, 'cssDecodeCallback' ), $value );

		// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
		// The range is spelled with code point escapes so that it survives
		// any re-encoding or Unicode normalisation of this source file.
		$value = preg_replace_callback(
			'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
			array( __CLASS__, 'cssNormalizeUnicodeWidth' ),
			$value
		);

		// Convert more characters IE6 might treat as ascii
		// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
		$value = str_replace(
			array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
			array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
			$value
		);

		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}

		// S followed by repeat, iteration, or prolonged sound marks,
		// which IE will treat as "ss"
		$value = preg_replace(
			'/s(?:
				\xE3\x80\xB1 | # U+3031
				\xE3\x82\x9D | # U+309D
				\xE3\x83\xBC | # U+30FC
				\xE3\x83\xBD | # U+30FD
				\xEF\xB9\xBC | # U+FE7C
				\xEF\xB9\xBD | # U+FE7D
				\xEF\xBD\xB0   # U+FF70
			)/ix',
			'ss',
			$value
		);

		return $value;
	}


	/**
	 * Normalise a CSS value with normalizeCss() and reject it if it still
	 * contains control characters or one of the blacklisted constructs
	 * (expression, filter:, accelerator:, -o-link*:, -o-replace:, url(),
	 * image(), image-set()).
	 *
	 * @param $value String
	 * @return string The normalised value, or a CSS comment naming the
	 *   reason the value was rejected
	 */
	static function checkCss( $value ) {
		$value = self::normalizeCss( $value );

		// Reject problematic keywords and control characters
		if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
			return '/* invalid control char */';
		} elseif ( preg_match(
			'! expression
				| filter\s*:
				| accelerator\s*:
				| -o-link\s*:
				| -o-link-source\s*:
				| -o-replace\s*:
				| url\s*\(
				| image\s*\(
				| image-set\s*\(
			!ix', $value ) ) {
			return '/* insecure input */';
		}
		return $value;
	}

	/**
	 * preg_replace_callback() handler for normalizeCss(): map a fullwidth
	 * character to its ASCII counterpart by subtracting 65248 (0xFEE0)
	 * from its code point.
	 *
	 * @param $matches Array
	 * @return string The ASCII character, or '' if the match could not be
	 *   converted to a code point
	 */
	static function cssNormalizeUnicodeWidth( $matches ) {
		$cp = utf8ToCodepoint( $matches[0] );
		if ( $cp === false ) {
			return '';
		}
		return chr( $cp - 65248 ); // ASCII range \x21-\x7A
	}

	/**
	 * preg_replace_callback() handler for normalizeCss(): decode one CSS
	 * backslash escape. Line continuations vanish; newline, quotes and
	 * backslash are re-emitted as a hexadecimal escape; any other
	 * character is returned unescaped.
	 *
	 * @param $matches Array
	 * @return string
	 */
	static function cssDecodeCallback( $matches ) {
		if ( $matches[1] !== '' ) {
			// Line continuation
			return '';
		} elseif ( $matches[2] !== '' ) {
			$char = codepointToUtf8( hexdec( $matches[2] ) );
		} elseif ( $matches[3] !== '' ) {
			$char = $matches[3];
		} else {
			$char = '\\';
		}
		if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $char ) ) . ' ';
		} else {
			// Decode unnecessary escape
			return $char;
		}
	}

	/**
	 * Take the attribute text of a tag, parse it, convert deprecated
	 * attributes, drop everything not whitelisted for the element, and
	 * return the result re-serialised as name="value" pairs.
	 *
	 * @param $text String: raw attribute text
	 * @param $element String: element name
	 * @return string Empty, or the attributes with a leading space
	 */
	static function fixTagAttributes( $text, $element ) {
		if( trim( $text ) == '' ) {
			return '';
		}

		$decoded = Sanitizer::decodeTagAttributes( $text );
		$decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );

		$attribs = array();
		foreach( $stripped as $attribute => $value ) {
			$encAttribute = htmlspecialchars( $attribute );
			$encValue = Sanitizer::safeEncodeAttribute( $value );

			$attribs[] = "$encAttribute=\"$encValue\"";
		}
		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
	}

	/**
	 * Encode an attribute value for HTML output: escape the HTML special
	 * characters (including both quote styles) and write newline, carriage
	 * return and tab as numeric character references.
	 *
	 * @param $text String
	 * @return string HTML-encoded text fragment
	 */
	static function encodeAttribute( $text ) {
		$encValue = htmlspecialchars( $text, ENT_QUOTES );

		// Whitespace is normalized during attribute decoding,
		// so if we've been passed non-spaces we must encode them
		// ahead of time or they won't be preserved.
		$encValue = strtr( $encValue, array(
			"\n" => '&#10;',
			"\r" => '&#13;',
			"\t" => '&#9;',
		) );

		return $encValue;
	}

	/**
	 * Like encodeAttribute(), but additionally writes wiki-syntax trigger
	 * sequences ("{", "[", "''", "|", "__", ISBN/RFC/PMID, URL protocols)
	 * with character references so the value is not expanded by a later
	 * parsing pass.
	 *
	 * @param $text String
	 * @return string HTML-encoded text fragment
	 */
	static function safeEncodeAttribute( $text ) {
		$encValue = Sanitizer::encodeAttribute( $text );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$encValue = strtr( $encValue, array(
			'<'    => '&lt;',   // This should never happen,
			'>'    => '&gt;',   // we've received invalid input
			'"'    => '&quot;', // which should have been escaped.
			'{'    => '&#123;',
			'['    => '&#91;',
			"''"   => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC'  => '&#82;FC',
			'PMID' => '&#80;MID',
			'|'    => '&#124;',
			'__'   => '&#95;_',
		) );

		# Stupid hack
		$encValue = preg_replace_callback(
			'/(' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$encValue );
		return $encValue;
	}

	/**
	 * Turn an arbitrary string into a value usable as an HTML id.
	 *
	 * With $wgHtml5 and $wgExperimentalHtmlIds enabled (and 'legacy' not
	 * requested) runs of whitespace and of the characters _ ' " & # % are
	 * collapsed to a single underscore and surrounding underscores are
	 * trimmed. Otherwise the id is URL-encoded with '%' replaced by '.'
	 * (and '%3A' by ':'), and prefixed with 'x' if it does not start with
	 * a letter.
	 *
	 * @param $id String: id to escape
	 * @param $options Mixed: string or array of strings:
	 *   'noninitial' - do not force the first character to be a letter
	 *   'legacy'     - always use the URL-encoding based escaping
	 * @return string
	 */
	static function escapeId( $id, $options = array() ) {
		global $wgHtml5, $wgExperimentalHtmlIds;
		$options = (array)$options;

		if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
			$id = Sanitizer::decodeCharReferences( $id );
			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
			$id = trim( $id, '_' );
			if ( $id === '' ) {
				# Must have been all whitespace to start with.
				return '_';
			} else {
				return $id;
			}
		}

		# HTML4-style escaping
		static $replace = array(
			'%3A' => ':',
			'%' => '.'
		);

		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

		if ( !preg_match( '/^[a-zA-Z]/', $id )
			&& !in_array( 'noninitial', $options ) ) {
			// Initial character must be a letter!
			$id = "x$id";
		}
		return $id;
	}

	/**
	 * Turn an arbitrary string into something usable as a CSS class name:
	 * a leading digit or hyphen, control characters, spaces, ASCII
	 * punctuation and non-breaking spaces become underscores, runs of
	 * underscores are collapsed, and trailing underscores are removed.
	 *
	 * @param $class String
	 * @return string
	 */
	static function escapeClass( $class ) {
		// Convert ugly stuff to underscores and kill underscores in ugly places
		return rtrim(preg_replace(
			array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
			'_',
			$class ), '_');
	}

	/**
	 * Escape text for HTML output while letting character references in
	 * the input through: references are decoded first, then the result is
	 * HTML-escaped (including both quote styles).
	 *
	 * @param $html String
	 * @return string Escaped text
	 */
	static function escapeHtmlAllowEntities( $html ) {
		$html = Sanitizer::decodeCharReferences( $html );
		# It seems wise to escape ' as well as ", as a matter of course.  Can't
		# hurt.
		$html = htmlspecialchars( $html, ENT_QUOTES );
		return $html;
	}

	/**
	 * preg_replace_callback() handler for safeEncodeAttribute(): write the
	 * colon of a matched URL protocol as a numeric character reference so
	 * the protocol is not recognised by later passes.
	 *
	 * @param $matches Array
	 * @return string
	 */
	private static function armorLinksCallback( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	}

	/**
	 * Parse the attribute text of a tag into an array.
	 *
	 * Attribute names are lower-cased; values have whitespace runs
	 * collapsed to one space, are trimmed, and have character references
	 * decoded. A later attribute with the same name overrides an earlier
	 * one.
	 *
	 * @param $text String
	 * @return Array: attribute name => decoded value
	 */
	public static function decodeTagAttributes( $text ) {
		if( trim( $text ) == '' ) {
			return array();
		}

		$attribs = array();
		$pairs = array();
		if( !preg_match_all(
			self::getAttribsRegex(),
			$text,
			$pairs,
			PREG_SET_ORDER ) ) {
			return $attribs;
		}

		foreach( $pairs as $set ) {
			$attribute = strtolower( $set[1] );
			$value = Sanitizer::getTagAttributeCallback( $set );

			// Normalize whitespace
			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
			$value = trim( $value );

			// Decode character references
			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
		}
		return $attribs;
	}

	/**
	 * Pick the attribute value out of one match set produced by the
	 * getAttribsRegex() pattern. The highest-numbered capture group that
	 * is present wins; an attribute without any value yields its own name.
	 *
	 * @param $set Array: one PREG_SET_ORDER match
	 * @return string
	 * @throws MWException if a "= value" part matched but no value group did
	 */
	private static function getTagAttributeCallback( $set ) {
		if( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			return $set[6];
		} elseif( isset( $set[5] ) ) {
			# No quotes.
			return $set[5];
		} elseif( isset( $set[4] ) ) {
			# Single-quoted
			return $set[4];
		} elseif( isset( $set[3] ) ) {
			# Double-quoted
			return $set[3];
		} elseif( !isset( $set[2] ) ) {
			# In XHTML, attributes must have a value.
			# For 'reduced' form, return explicitly the attribute name here.
			return $set[1];
		} else {
			throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
		}
	}

	/**
	 * Normalise an attribute value: character references are normalised,
	 * each whitespace character becomes a plain space, and double quotes
	 * are written as &quot;.
	 *
	 * @param $text String
	 * @return string
	 */
	private static function normalizeAttributeValue( $text ) {
		return str_replace( '"', '&quot;',
			self::normalizeWhitespace(
				Sanitizer::normalizeCharReferences( $text ) ) );
	}

	/**
	 * Replace every CR LF pair and every single space, CR, LF or tab with
	 * one space character.
	 *
	 * @param $text String
	 * @return string
	 */
	private static function normalizeWhitespace( $text ) {
		return preg_replace(
			'/\r\n|[\x20\x0d\x0a\x09]/',
			' ',
			$text );
	}

	/**
	 * Collapse runs of spaces and underscores in a section name into a
	 * single space and trim the result.
	 *
	 * @param $section String
	 * @return string
	 */
	static function normalizeSectionNameWhitespace( $section ) {
		return trim( preg_replace( '/[ _]+/', ' ', $section ) );
	}

	/**
	 * Make sure every character reference in the text is well formed:
	 * known named entities become numeric references (lt, gt, amp and quot
	 * are kept by name), valid numeric references are re-emitted in
	 * canonical form, and anything else, including a bare '&', is escaped.
	 *
	 * @param $text String
	 * @return string
	 */
	static function normalizeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
			$text );
	}

	/**
	 * preg_replace_callback() handler for normalizeCharReferences().
	 *
	 * @param $matches Array: match of CHAR_REFS_REGEX
	 * @return string
	 */
	static function normalizeCharReferencesCallback( $matches ) {
		$ret = null;
		if( $matches[1] != '' ) {
			$ret = Sanitizer::normalizeEntity( $matches[1] );
		} elseif( $matches[2] != '' ) {
			$ret = Sanitizer::decCharReference( $matches[2] );
		} elseif( $matches[3] != '' ) {
			$ret = Sanitizer::hexCharReference( $matches[3] );
		}
		if( is_null( $ret ) ) {
			# Invalid reference or bare ampersand: escape the matched text
			return htmlspecialchars( $matches[0] );
		} else {
			return $ret;
		}
	}

	/**
	 * Normalise a named entity reference.
	 *
	 * Aliases are rewritten to their canonical name; lt, gt, amp and quot
	 * are returned by name; other known entities are returned as a decimal
	 * numeric reference. For an unknown name the ampersand is escaped, so
	 * the text is shown literally instead of forming an invalid reference.
	 *
	 * @param $name String: entity name without '&' and ';'
	 * @return string
	 */
	static function normalizeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			return '&' . self::$htmlEntityAliases[$name] . ';';
		} elseif ( in_array( $name,
			array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
			return "&$name;";
		} elseif ( isset( self::$htmlEntities[$name] ) ) {
			return '&#' . self::$htmlEntities[$name] . ';';
		} else {
			return "&amp;$name;";
		}
	}

	/**
	 * Re-emit a decimal character reference in canonical form.
	 *
	 * @param $codepoint String: the digits of the reference
	 * @return string|null The reference, or null if the code point is not
	 *   accepted by validateCodepoint()
	 */
	static function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		if( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#%d;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Re-emit a hexadecimal character reference in canonical (lower-case)
	 * form.
	 *
	 * @param $codepoint String: the hex digits of the reference
	 * @return string|null The reference, or null if the code point is not
	 *   accepted by validateCodepoint()
	 */
	static function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		if( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#x%x;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Tell whether a code point may be emitted as a character reference:
	 * tab, LF, CR, U+0020..U+D7FF, U+E000..U+FFFD or U+10000..U+10FFFF.
	 *
	 * @param $codepoint Integer
	 * @return boolean
	 */
	private static function validateCodepoint( $codepoint ) {
		return ($codepoint ==    0x09)
			|| ($codepoint ==    0x0a)
			|| ($codepoint ==    0x0d)
			|| ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
			|| ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
			|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
	}

	/**
	 * Decode the numeric and named character references in the text via
	 * decodeCharReferencesCallback().
	 *
	 * @param $text String
	 * @return string
	 */
	public static function decodeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text );
	}

	/**
	 * Like decodeCharReferences(), but if at least one replacement was
	 * made the result is additionally passed through
	 * $wgContLang->normalize().
	 *
	 * @param $text String
	 * @return string
	 */
	public static function decodeCharReferencesAndNormalize( $text ) {
		global $wgContLang;
		$text = preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text, /* limit */ -1, $count );

		if ( $count ) {
			return $wgContLang->normalize( $text );
		} else {
			return $text;
		}
	}

	/**
	 * preg_replace_callback() handler for decodeCharReferences(): hand
	 * named references to decodeEntity() and numeric ones to decodeChar();
	 * a bare ampersand is returned unchanged.
	 *
	 * @param $matches Array: match of CHAR_REFS_REGEX
	 * @return string
	 */
	static function decodeCharReferencesCallback( $matches ) {
		if( $matches[1] != '' ) {
			return Sanitizer::decodeEntity( $matches[1] );
		} elseif( $matches[2] != '' ) {
			return  Sanitizer::decodeChar( intval( $matches[2] ) );
		} elseif( $matches[3] != ''  ) {
			return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	static function decodeChar( $codepoint ) {
01475 if( Sanitizer::validateCodepoint( $codepoint ) ) { 01476 return codepointToUtf8( $codepoint ); 01477 } else { 01478 return UTF8_REPLACEMENT; 01479 } 01480 } 01481 01490 static function decodeEntity( $name ) { 01491 if ( isset( self::$htmlEntityAliases[$name] ) ) { 01492 $name = self::$htmlEntityAliases[$name]; 01493 } 01494 if( isset( self::$htmlEntities[$name] ) ) { 01495 return codepointToUtf8( self::$htmlEntities[$name] ); 01496 } else { 01497 return "&$name;"; 01498 } 01499 } 01500 01507 static function attributeWhitelist( $element ) { 01508 static $list; 01509 if( !isset( $list ) ) { 01510 $list = Sanitizer::setupAttributeWhitelist(); 01511 } 01512 return isset( $list[$element] ) 01513 ? $list[$element] 01514 : array(); 01515 } 01516 01522 static function setupAttributeWhitelist() { 01523 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes; 01524 01525 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' ); 01526 01527 if ( $wgAllowRdfaAttributes ) { 01528 #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 01529 $common = array_merge( $common, array( 01530 'about', 'property', 'resource', 'datatype', 'typeof', 01531 ) ); 01532 } 01533 01534 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) { 01535 # add HTML5 microdata tages as pecified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model 01536 $common = array_merge( $common, array( 01537 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' 01538 ) ); 01539 } 01540 01541 $block = array_merge( $common, array( 'align' ) ); 01542 $tablealign = array( 'align', 'char', 'charoff', 'valign' ); 01543 $tablecell = array( 'abbr', 01544 'axis', 01545 'headers', 01546 'scope', 01547 'rowspan', 01548 'colspan', 01549 'nowrap', # deprecated 01550 'width', # deprecated 01551 'height', # deprecated 01552 'bgcolor' # deprecated 01553 ); 01554 01555 # Numbers refer to sections in HTML 4.01 standard 
describing the element. 01556 # See: http://www.w3.org/TR/html4/ 01557 $whitelist = array ( 01558 # 7.5.4 01559 'div' => $block, 01560 'center' => $common, # deprecated 01561 'span' => $block, # ?? 01562 01563 # 7.5.5 01564 'h1' => $block, 01565 'h2' => $block, 01566 'h3' => $block, 01567 'h4' => $block, 01568 'h5' => $block, 01569 'h6' => $block, 01570 01571 # 7.5.6 01572 # address 01573 01574 # 8.2.4 01575 # bdo 01576 01577 # 9.2.1 01578 'em' => $common, 01579 'strong' => $common, 01580 'cite' => $common, 01581 'dfn' => $common, 01582 'code' => $common, 01583 'samp' => $common, 01584 'kbd' => $common, 01585 'var' => $common, 01586 'abbr' => $common, 01587 # acronym 01588 01589 # 9.2.2 01590 'blockquote' => array_merge( $common, array( 'cite' ) ), 01591 # q 01592 01593 # 9.2.3 01594 'sub' => $common, 01595 'sup' => $common, 01596 01597 # 9.3.1 01598 'p' => $block, 01599 01600 # 9.3.2 01601 'br' => array( 'id', 'class', 'title', 'style', 'clear' ), 01602 01603 # 9.3.4 01604 'pre' => array_merge( $common, array( 'width' ) ), 01605 01606 # 9.4 01607 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ), 01608 'del' => array_merge( $common, array( 'cite', 'datetime' ) ), 01609 01610 # 10.2 01611 'ul' => array_merge( $common, array( 'type' ) ), 01612 'ol' => array_merge( $common, array( 'type', 'start' ) ), 01613 'li' => array_merge( $common, array( 'type', 'value' ) ), 01614 01615 # 10.3 01616 'dl' => $common, 01617 'dd' => $common, 01618 'dt' => $common, 01619 01620 # 11.2.1 01621 'table' => array_merge( $common, 01622 array( 'summary', 'width', 'border', 'frame', 01623 'rules', 'cellspacing', 'cellpadding', 01624 'align', 'bgcolor', 01625 ) ), 01626 01627 # 11.2.2 01628 'caption' => array_merge( $common, array( 'align' ) ), 01629 01630 # 11.2.3 01631 'thead' => array_merge( $common, $tablealign ), 01632 'tfoot' => array_merge( $common, $tablealign ), 01633 'tbody' => array_merge( $common, $tablealign ), 01634 01635 # 11.2.4 01636 'colgroup' => array_merge( 
$common, array( 'span', 'width' ), $tablealign ), 01637 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ), 01638 01639 # 11.2.5 01640 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ), 01641 01642 # 11.2.6 01643 'td' => array_merge( $common, $tablecell, $tablealign ), 01644 'th' => array_merge( $common, $tablecell, $tablealign ), 01645 01646 # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object 01647 'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa 01648 01649 # 13.2 01650 # Not usually allowed, but may be used for extension-style hooks 01651 # such as <math> when it is rasterized, or if $wgAllowImageTag is 01652 # true 01653 'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ), 01654 01655 # 15.2.1 01656 'tt' => $common, 01657 'b' => $common, 01658 'i' => $common, 01659 'big' => $common, 01660 'small' => $common, 01661 'strike' => $common, 01662 's' => $common, 01663 'u' => $common, 01664 01665 # 15.2.2 01666 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ), 01667 # basefont 01668 01669 # 15.3 01670 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ), 01671 01672 # XHTML Ruby annotation text module, simple ruby only. 
01673 # http://www.w3c.org/TR/ruby/ 01674 'ruby' => $common, 01675 # rbc 01676 # rtc 01677 'rb' => $common, 01678 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), 01679 'rp' => $common, 01680 01681 # MathML root element, where used for extensions 01682 # 'title' may not be 100% valid here; it's XHTML 01683 # http://www.w3.org/TR/REC-MathML/ 01684 'math' => array( 'class', 'style', 'id', 'title' ), 01685 ); 01686 return $whitelist; 01687 } 01688 01699 static function stripAllTags( $text ) { 01700 # Actual <tags> 01701 $text = StringUtils::delimiterReplace( '<', '>', '', $text ); 01702 01703 # Normalize &entities and whitespace 01704 $text = self::decodeCharReferences( $text ); 01705 $text = self::normalizeWhitespace( $text ); 01706 01707 return $text; 01708 } 01709 01719 static function hackDocType() { 01720 $out = "<!DOCTYPE html [\n"; 01721 foreach( self::$htmlEntities as $entity => $codepoint ) { 01722 $out .= "<!ENTITY $entity \"&#$codepoint;\">"; 01723 } 01724 $out .= "]>\n"; 01725 return $out; 01726 } 01727 01732 static function cleanUrl( $url ) { 01733 # Normalize any HTML entities in input. They will be 01734 # re-escaped by makeExternalLink(). 01735 $url = Sanitizer::decodeCharReferences( $url ); 01736 01737 # Escape any control characters introduced by the above step 01738 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 01739 array( __CLASS__, 'cleanUrlCallback' ), $url ); 01740 01741 # Validate hostname portion 01742 $matches = array(); 01743 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { 01744 list( /* $whole */, $protocol, $host, $rest ) = $matches; 01745 01746 // Characters that will be ignored in IDNs. 01747 // http://tools.ietf.org/html/3454#section-3.1 01748 // Strip them before further processing so blacklists and such work. 
01749 $strip = "/ 01750 \\s| # general whitespace 01751 \xc2\xad| # 00ad SOFT HYPHEN 01752 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 01753 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 01754 \xe2\x81\xa0| # 2060 WORD JOINER 01755 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 01756 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 01757 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 01758 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 01759 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 01760 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 01761 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 01762 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 01763 /xuD"; 01764 01765 $host = preg_replace( $strip, '', $host ); 01766 01767 // @todo FIXME: Validate hostnames here 01768 01769 return $protocol . $host . $rest; 01770 } else { 01771 return $url; 01772 } 01773 } 01774 01779 static function cleanUrlCallback( $matches ) { 01780 return urlencode( $matches[0] ); 01781 } 01782 01811 public static function validateEmail( $addr ) { 01812 $result = null; 01813 if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) { 01814 return $result; 01815 } 01816 01817 // Please note strings below are enclosed in brackets [], this make the 01818 // hyphen "-" a range indicator. Hence it is double backslashed below. 01819 // See bug 26948 01820 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ; 01821 $rfc1034_ldh_str = "a-z0-9\\-" ; 01822 01823 $HTML5_email_regexp = "/ 01824 ^ # start of string 01825 [$rfc5322_atext\\.]+ # user part which is liberal :p 01826 @ # 'apostrophe' 01827 [$rfc1034_ldh_str]+ # First domain part 01828 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot 01829 $ # End of string 01830 /ix" ; // case Insensitive, eXtended 01831 01832 return (bool) preg_match( $HTML5_email_regexp, $addr ); 01833 } 01834 }