MediaWiki
REL1_24
|
<?php
/**
 * HTML sanitizer: strips disallowed tags and attributes from user-supplied
 * markup and normalizes character references, CSS and id/class values.
 *
 * NOTE(review): this text was recovered from a rendered source listing in
 * which entity-escaped string literals had been flattened to their decoded
 * characters (e.g. a '>' => '>' replacement). The '&gt;', '&#10;', '&#123;'
 * etc. literals below, and the fullwidth range in normalizeCss(), are
 * restorations -- confirm them against the canonical file before release.
 */
class Sanitizer {
	/**
	 * Matches named, decimal and hexadecimal character references, or a
	 * bare ampersand (group 4). Written in PCRE extended (x) mode.
	 */
	const CHAR_REFS_REGEX =
		'/&([A-Za-z0-9\x80-\xff]+);
		 |&\#([0-9]+);
		 |&\#[xX]([0-9A-Fa-f]+);
		 |(&)/x';

	/** Values matching this pattern are treated as script URIs and rejected. */
	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
	/** Attribute names that declare an XML namespace prefix. */
	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

	/**
	 * Map of named HTML entities to their Unicode code points.
	 */
	private static $htmlEntities = array(
		'Aacute' => 193,
		'aacute' => 225,
		'Acirc' => 194,
		'acirc' => 226,
		'acute' => 180,
		'AElig' => 198,
		'aelig' => 230,
		'Agrave' => 192,
		'agrave' => 224,
		'alefsym' => 8501,
		'Alpha' => 913,
		'alpha' => 945,
		'amp' => 38,
		'and' => 8743,
		'ang' => 8736,
		'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
		'Aring' => 197,
		'aring' => 229,
		'asymp' => 8776,
		'Atilde' => 195,
		'atilde' => 227,
		'Auml' => 196,
		'auml' => 228,
		'bdquo' => 8222,
		'Beta' => 914,
		'beta' => 946,
		'brvbar' => 166,
		'bull' => 8226,
		'cap' => 8745,
		'Ccedil' => 199,
		'ccedil' => 231,
		'cedil' => 184,
		'cent' => 162,
		'Chi' => 935,
		'chi' => 967,
		'circ' => 710,
		'clubs' => 9827,
		'cong' => 8773,
		'copy' => 169,
		'crarr' => 8629,
		'cup' => 8746,
		'curren' => 164,
		'dagger' => 8224,
		'Dagger' => 8225,
		'darr' => 8595,
		'dArr' => 8659,
		'deg' => 176,
		'Delta' => 916,
		'delta' => 948,
		'diams' => 9830,
		'divide' => 247,
		'Eacute' => 201,
		'eacute' => 233,
		'Ecirc' => 202,
		'ecirc' => 234,
		'Egrave' => 200,
		'egrave' => 232,
		'empty' => 8709,
		'emsp' => 8195,
		'ensp' => 8194,
		'Epsilon' => 917,
		'epsilon' => 949,
		'equiv' => 8801,
		'Eta' => 919,
		'eta' => 951,
		'ETH' => 208,
		'eth' => 240,
		'Euml' => 203,
		'euml' => 235,
		'euro' => 8364,
		'exist' => 8707,
		'fnof' => 402,
		'forall' => 8704,
		'frac12' => 189,
		'frac14' => 188,
		'frac34' => 190,
		'frasl' => 8260,
		'Gamma' => 915,
		'gamma' => 947,
		'ge' => 8805,
		'gt' => 62,
		'harr' => 8596,
		'hArr' => 8660,
		'hearts' => 9829,
		'hellip' => 8230,
		'Iacute' => 205,
		'iacute' => 237,
		'Icirc' => 206,
		'icirc' => 238,
		'iexcl' => 161,
		'Igrave' => 204,
		'igrave' => 236,
		'image' => 8465,
		'infin' => 8734,
		'int' => 8747,
		'Iota' => 921,
		'iota' => 953,
		'iquest' => 191,
		'isin' => 8712,
		'Iuml' => 207,
		'iuml' => 239,
		'Kappa' => 922,
		'kappa' => 954,
		'Lambda' => 923,
		'lambda' => 955,
		'lang' => 9001,
		'laquo' => 171,
		'larr' => 8592,
		'lArr' => 8656,
		'lceil' => 8968,
		'ldquo' => 8220,
		'le' => 8804,
		'lfloor' => 8970,
		'lowast' => 8727,
		'loz' => 9674,
		'lrm' => 8206,
		'lsaquo' => 8249,
		'lsquo' => 8216,
		'lt' => 60,
		'macr' => 175,
		'mdash' => 8212,
		'micro' => 181,
		'middot' => 183,
		'minus' => 8722,
		'Mu' => 924,
		'mu' => 956,
		'nabla' => 8711,
		'nbsp' => 160,
		'ndash' => 8211,
		'ne' => 8800,
		'ni' => 8715,
		'not' => 172,
		'notin' => 8713,
		'nsub' => 8836,
		'Ntilde' => 209,
		'ntilde' => 241,
		'Nu' => 925,
		'nu' => 957,
		'Oacute' => 211,
		'oacute' => 243,
		'Ocirc' => 212,
		'ocirc' => 244,
		'OElig' => 338,
		'oelig' => 339,
		'Ograve' => 210,
		'ograve' => 242,
		'oline' => 8254,
		'Omega' => 937,
		'omega' => 969,
		'Omicron' => 927,
		'omicron' => 959,
		'oplus' => 8853,
		'or' => 8744,
		'ordf' => 170,
		'ordm' => 186,
		'Oslash' => 216,
		'oslash' => 248,
		'Otilde' => 213,
		'otilde' => 245,
		'otimes' => 8855,
		'Ouml' => 214,
		'ouml' => 246,
		'para' => 182,
		'part' => 8706,
		'permil' => 8240,
		'perp' => 8869,
		'Phi' => 934,
		'phi' => 966,
		'Pi' => 928,
		'pi' => 960,
		'piv' => 982,
		'plusmn' => 177,
		'pound' => 163,
		'prime' => 8242,
		'Prime' => 8243,
		'prod' => 8719,
		'prop' => 8733,
		'Psi' => 936,
		'psi' => 968,
		'quot' => 34,
		'radic' => 8730,
		'rang' => 9002,
		'raquo' => 187,
		'rarr' => 8594,
		'rArr' => 8658,
		'rceil' => 8969,
		'rdquo' => 8221,
		'real' => 8476,
		'reg' => 174,
		'rfloor' => 8971,
		'Rho' => 929,
		'rho' => 961,
		'rlm' => 8207,
		'rsaquo' => 8250,
		'rsquo' => 8217,
		'sbquo' => 8218,
		'Scaron' => 352,
		'scaron' => 353,
		'sdot' => 8901,
		'sect' => 167,
		'shy' => 173,
		'Sigma' => 931,
		'sigma' => 963,
		'sigmaf' => 962,
		'sim' => 8764,
		'spades' => 9824,
		'sub' => 8834,
		'sube' => 8838,
		'sum' => 8721,
		'sup' => 8835,
		'sup1' => 185,
		'sup2' => 178,
		'sup3' => 179,
		'supe' => 8839,
		'szlig' => 223,
		'Tau' => 932,
		'tau' => 964,
		'there4' => 8756,
		'Theta' => 920,
		'theta' => 952,
		'thetasym' => 977,
		'thinsp' => 8201,
		'THORN' => 222,
		'thorn' => 254,
		'tilde' => 732,
		'times' => 215,
		'trade' => 8482,
		'Uacute' => 218,
		'uacute' => 250,
		'uarr' => 8593,
		'uArr' => 8657,
		'Ucirc' => 219,
		'ucirc' => 251,
		'Ugrave' => 217,
		'ugrave' => 249,
		'uml' => 168,
		'upsih' => 978,
		'Upsilon' => 933,
		'upsilon' => 965,
		'Uuml' => 220,
		'uuml' => 252,
		'weierp' => 8472,
		'Xi' => 926,
		'xi' => 958,
		'Yacute' => 221,
		'yacute' => 253,
		'yen' => 165,
		'Yuml' => 376,
		'yuml' => 255,
		'Zeta' => 918,
		'zeta' => 950,
		'zwj' => 8205,
		'zwnj' => 8204
	);

	/**
	 * Localized entity names that normalizeEntity() maps to a canonical name.
	 */
	private static $htmlEntityAliases = array(
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	);

	/**
	 * Lazily-built regex used to split a tag's attribute string.
	 */
	private static $attribsRegex;

	/**
	 * Build (once) and return the regex matching one name[=value] attribute.
	 *
	 * Capture groups: 1 name, 2 the whole "=value" part, 3 double-quoted
	 * value, 4 single-quoted value, 5 unquoted value, 6 unquoted "#hex" color.
	 *
	 * @return string
	 */
	static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$attribFirst = '[:A-Z_a-z0-9]';
			$attrib = '[:A-Z_a-z-.0-9]';
			$space = '[\x09\x0a\x0d\x20]';
			self::$attribsRegex =
				"/(?:^|$space)({$attribFirst}{$attrib}*)
				  ($space*=$space*
					(?:
					 # The attribute value: quoted or alone
					  \"([^<\"]*)\"
					 | '([^<']*)'
					 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
					 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
										 # colors are specified like this.
										 # We'll be normalizing it.
					)
				  )?(?=$space|\$)/sx";
		}
		return self::$attribsRegex;
	}

	/**
	 * Escape every tag that is not on the allowed list, strip disallowed
	 * attributes from the allowed ones and, unless $wgUseTidy is set,
	 * balance open/close tags with a tag stack.
	 *
	 * @param string $text
	 * @param callable $processCallback Called as ( &$params, $args ) on each
	 *   allowed tag's attribute string before validation
	 * @param array|bool $args Passed through to $processCallback
	 * @param array $extratags Additional tag names to allow (paired tags)
	 * @param array $removetags Tag names to remove from the allowed list
	 * @return string
	 */
	static function removeHTMLtags( $text, $processCallback = null,
		$args = array(), $extratags = array(), $removetags = array()
	) {
		global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;

		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

		wfProfileIn( __METHOD__ );

		// Base our staticInitialised variable off of the global config state so that if the globals
		// are changed (like in the screwed up test system) we will re-initialise the settings.
		$globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
		if ( !$staticInitialised || $staticInitialised != $globalContext ) {

			$htmlpairsStatic = array( # Tags that must be closed
				'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
				'kbd', 'samp', 'data', 'time', 'mark'
			);
			$htmlsingle = array(
				'br', 'wbr', 'hr', 'li', 'dt', 'dd'
			);
			$htmlsingleonly = array( # Elements that cannot have close tags
				'br', 'wbr', 'hr'
			);
			if ( $wgAllowMicrodataAttributes ) {
				$htmlsingle[] = $htmlsingleonly[] = 'meta';
				$htmlsingle[] = $htmlsingleonly[] = 'link';
			}
			$htmlnest = array( # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
				'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
			);
			$tabletags = array( # Can only appear inside table, we will close them
				'td', 'th', 'tr',
			);
			$htmllist = array( # Tags used by list
				'ul', 'ol',
			);
			$listtags = array( # Tags that can appear in a list
				'li',
			);

			if ( $wgAllowImageTag ) {
				$htmlsingle[] = 'img';
				$htmlsingleonly[] = 'img';
			}

			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

			# Convert them all to hashtables for faster lookup
			$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
				'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
			foreach ( $vars as $var ) {
				$$var = array_flip( $$var );
			}
			$staticInitialised = $globalContext;
		}
		# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
		$extratags = array_flip( $extratags );
		$removetags = array_flip( $removetags );
		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

		# Remove HTML comments
		$text = Sanitizer::removeHTMLcomments( $text );
		$bits = explode( '<', $text );
		# Text before the first '<' cannot contain a tag, so any '>' is literal
		$text = str_replace( '>', '&gt;', array_shift( $bits ) );
		if ( !$wgUseTidy ) {
			$tagstack = $tablestack = array();
			foreach ( $bits as $x ) {
				$regs = array();
				# $slash: Does the current element start with a '/'?
				# $t: Current element name
				# $params: String between element name and >
				# $brace: Ending '>' or '/>'
				# $rest: Everything until the next element of $bits
				if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				} else {
					$slash = $t = $params = $brace = $rest = null;
				}

				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					# Check our stack
					if ( $slash && isset( $htmlsingleonly[$t] ) ) {
						$badtag = true;
					} elseif ( $slash ) {
						# Closing a tag... is it the one we just opened?
						wfSuppressWarnings();
						$ot = array_pop( $tagstack );
						wfRestoreWarnings();

						if ( $ot != $t ) {
							if ( isset( $htmlsingleallowed[$ot] ) ) {
								# Pop all elements with an optional close tag
								# and see if we find a match below them
								$optstack = array();
								array_push( $optstack, $ot );
								wfSuppressWarnings();
								$ot = array_pop( $tagstack );
								wfRestoreWarnings();
								while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
									array_push( $optstack, $ot );
									wfSuppressWarnings();
									$ot = array_pop( $tagstack );
									wfRestoreWarnings();
								}
								if ( $t != $ot ) {
									# No match. Push the optional elements back again
									$badtag = true;
									wfSuppressWarnings();
									$ot = array_pop( $optstack );
									wfRestoreWarnings();
									while ( $ot ) {
										array_push( $tagstack, $ot );
										wfSuppressWarnings();
										$ot = array_pop( $optstack );
										wfRestoreWarnings();
									}
								}
							} else {
								wfSuppressWarnings();
								array_push( $tagstack, $ot );
								wfRestoreWarnings();

								# <li> can be nested in <ul> or <ol>, skip those cases:
								if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
									$badtag = true;
								}
							}
						} else {
							if ( $t == 'table' ) {
								$tagstack = array_pop( $tablestack );
							}
						}
						$newparams = '';
					} else {
						# Keep track for later
						if ( isset( $tabletags[$t] ) &&
							!in_array( 'table', $tagstack ) ) {
							$badtag = true;
						} elseif ( in_array( $t, $tagstack ) &&
							!isset( $htmlnest[$t] ) ) {
							$badtag = true;
						# Is it a self closed htmlpair ? (bug 5487)
						} elseif ( $brace == '/>' &&
							isset( $htmlpairs[$t] ) ) {
							$badtag = true;
						} elseif ( isset( $htmlsingleonly[$t] ) ) {
							# Hack to force empty tag for unclosable elements
							$brace = '/>';
						} elseif ( isset( $htmlsingle[$t] ) ) {
							# Hack to not close $htmlsingle tags
							$brace = null;
							# Still need to push this optionally-closed tag to
							# the tag stack so that we can match end tags
							# instead of marking them as bad.
							array_push( $tagstack, $t );
						} elseif ( isset( $tabletags[$t] )
							&& in_array( $t, $tagstack ) ) {
							// New table tag but forgot to close the previous one
							$text .= "</$t>";
						} else {
							if ( $t == 'table' ) {
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}

						# Replace any variables or template parameters with
						# plaintext results.
						if ( is_callable( $processCallback ) ) {
							call_user_func_array( $processCallback, array( &$params, $args ) );
						}

						if ( !Sanitizer::validateTag( $params, $t ) ) {
							$badtag = true;
						}

						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
						$text .= "<$slash$t$newparams$close>$rest";
						continue;
					}
				}
				# Disallowed or malformed tag: emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
			# Close off any remaining tags
			while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
				$text .= "</$t>\n";
				if ( $t == 'table' ) {
					$tagstack = array_pop( $tablestack );
				}
			}
		} else {
			# this might be possible using tidy itself
			foreach ( $bits as $x ) {
				preg_match(
					'/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
					$x,
					$regs
				);

				wfSuppressWarnings();
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				wfRestoreWarnings();

				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Remove every terminated "<!-- ... -->" comment from $text. When a
	 * comment sits alone on a line (newline before and after, spaces only
	 * in between) the whole line collapses to a single newline.
	 *
	 * @param string $text
	 * @return string
	 */
	static function removeHTMLcomments( $text ) {
		wfProfileIn( __METHOD__ );
		while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
			$end = strpos( $text, '-->', $start + 4 );
			if ( $end === false ) {
				# Unterminated comment; bail out
				break;
			}

			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline
			$spaceStart = max( $start - 1, 0 );
			$spaceLen = $end - $spaceStart;
			while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
				$spaceStart--;
				$spaceLen++;
			}
			while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
				$spaceLen++;
			}
			if ( substr( $text, $spaceStart, 1 ) === "\n"
				&& substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
			} else {
				# Remove just the comment.
				$text = substr_replace( $text, '', $start, $end - $start );
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Check tag-level requirements that depend on the attributes present:
	 * <meta> needs itemprop and content, <link> needs itemprop and href.
	 *
	 * @param string $params Raw attribute string of the tag
	 * @param string $element Lower-case tag name
	 * @return bool False if the tag must be rejected
	 */
	static function validateTag( $params, $element ) {
		$params = Sanitizer::decodeTagAttributes( $params );

		if ( $element == 'meta' || $element == 'link' ) {
			if ( !isset( $params['itemprop'] ) ) {
				// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
				return false;
			}
			if ( $element == 'meta' && !isset( $params['content'] ) ) {
				// <meta> must have a content="" for the itemprop
				return false;
			}
			if ( $element == 'link' && !isset( $params['href'] ) ) {
				// <link> must have an associated href=""
				return false;
			}
		}

		return true;
	}

	/**
	 * Filter decoded attributes against the whitelist for $element.
	 *
	 * @param array $attribs name => value
	 * @param string $element
	 * @return array
	 */
	static function validateTagAttributes( $attribs, $element ) {
		return Sanitizer::validateAttributes( $attribs,
			Sanitizer::attributeWhitelist( $element ) );
	}

	/**
	 * Filter decoded attributes against an explicit whitelist, cleaning
	 * style and id values and dropping values that look like script URIs.
	 *
	 * @param array $attribs name => value
	 * @param array $whitelist List of allowed attribute names
	 * @return array Surviving name => value pairs
	 */
	static function validateAttributes( $attribs, $whitelist ) {
		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

		$whitelist = array_flip( $whitelist );
		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

		$out = array();
		foreach ( $attribs as $attribute => $value ) {
			#allow XML namespace declaration if RDFa is enabled
			if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					$out[$attribute] = $value;
				}

				continue;
			}

			# Allow any attribute beginning with "data-"
			if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
				continue;
			}

			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			if ( $attribute == 'style' ) {
				$value = Sanitizer::checkCss( $value );
			}

			if ( $attribute === 'id' ) {
				$value = Sanitizer::escapeId( $value, 'noninitial' );
			}

			# WAI-ARIA
			# http://www.w3.org/TR/wai-aria/
			# http://www.whatwg.org/html/elements.html#wai-aria
			# For now we only support role="presentation" until we work out what roles should be
			# usable by content and we ensure that our code explicitly rejects patterns that
			# violate HTML5's ARIA restrictions.
			if ( $attribute === 'role' && $value !== 'presentation' ) {
				continue;
			}

			// RDFa and microdata properties allow URLs, URIs and/or CURIs.
			// Check them for sanity.
			if ( $attribute === 'rel' || $attribute === 'rev'
				# RDFa
				|| $attribute === 'about' || $attribute === 'property'
				|| $attribute === 'resource' || $attribute === 'datatype'
				|| $attribute === 'typeof'
				# HTML5 microdata
				|| $attribute === 'itemid' || $attribute === 'itemprop'
				|| $attribute === 'itemref' || $attribute === 'itemscope'
				|| $attribute === 'itemtype'
			) {
				//Paranoia. Allow "simple" values but suppress javascript
				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					continue;
				}
			}

			# NOTE: even though elements using href/src are not allowed directly, supply
			#       validation code that can be used by tag hook handlers, etc
			if ( $attribute === 'href' || $attribute === 'src' ) {
				if ( !preg_match( $hrefExp, $value ) ) {
					continue; //drop any href or src attributes not using an allowed protocol.
					// NOTE: this also drops all relative URLs
				}
			}

			// If this attribute was previously set, override it.
			// Output should only have one attribute of each name.
			$out[$attribute] = $value;
		}

		if ( $wgAllowMicrodataAttributes ) {
			# itemtype, itemid, itemref don't make sense without itemscope
			if ( !array_key_exists( 'itemscope', $out ) ) {
				unset( $out['itemtype'] );
				unset( $out['itemid'] );
				unset( $out['itemref'] );
			}
			# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
		}
		return $out;
	}

	/**
	 * Merge two attribute arrays; values in $b win, except that differing
	 * string 'class' values are combined into one de-duplicated list.
	 *
	 * @param array $a
	 * @param array $b
	 * @return array
	 */
	static function mergeAttributes( $a, $b ) {
		$out = array_merge( $a, $b );
		if ( isset( $a['class'] ) && isset( $b['class'] )
			&& is_string( $a['class'] ) && is_string( $b['class'] )
			&& $a['class'] !== $b['class']
		) {
			$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
				-1, PREG_SPLIT_NO_EMPTY );
			$out['class'] = implode( ' ', array_unique( $classes ) );
		}
		return $out;
	}

	/**
	 * Normalize a CSS value so checkCss() can inspect it: decode character
	 * references and CSS escapes, fold look-alike characters to ASCII and
	 * strip comments.
	 *
	 * @param string $value
	 * @return string
	 */
	public static function normalizeCss( $value ) {

		// Decode character references like &#123;
		$value = Sanitizer::decodeCharReferences( $value );

		// Decode escape sequences and line continuation
		// See the grammar in the CSS 2 spec, appendix D.
		// This has to be done AFTER decoding character references.
		// This means it isn't possible for this function to return
		// unsanitized escape sequences. It is possible to manufacture
		// input that contains character references that decode to
		// escape sequences that decode to character references, but
		// it's OK for the return value to contain character references
		// because the caller is supposed to escape those anyway.
		static $decodeRegex;
		if ( !$decodeRegex ) {
			$space = '[\\x20\\t\\r\\n\\f]';
			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
			$backslash = '\\\\';
			$decodeRegex = "/ $backslash
				(?:
					($nl) |  # 1. Line continuation
					([0-9A-Fa-f]{1,6})$space? |  # 2. character number
					(.) | # 3. backslash cancelling special meaning
					() | # 4. backslash at end of string
				)/xu";
		}
		$value = preg_replace_callback( $decodeRegex,
			array( __CLASS__, 'cssDecodeCallback' ), $value );

		// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
		$value = preg_replace_callback(
			'/[！-［］-ｚ]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
			function ( $matches ) {
				$cp = utf8ToCodepoint( $matches[0] );
				if ( $cp === false ) {
					return '';
				}
				return chr( $cp - 65248 ); // ASCII range \x21-\x7A
			},
			$value
		);

		// Convert more characters IE6 might treat as ascii
		// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
		$value = str_replace(
			array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
			array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
			$value
		);

		// Let the value through if it's nothing but a single comment, to
		// allow other functions which may reject it to pass some error
		// message through.
		if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
			// Remove any comments; IE gets token splitting wrong
			// This must be done AFTER decoding character references and
			// escape sequences, because those steps can introduce comments
			// This step cannot introduce character references or escape
			// sequences, because it replaces comments with spaces rather
			// than removing them completely.
			$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

			// Remove anything after a comment-start token, to guard against
			// incorrect client implementations.
			$commentPos = strpos( $value, '/*' );
			if ( $commentPos !== false ) {
				$value = substr( $value, 0, $commentPos );
			}
		}

		// S followed by repeat, iteration, or prolonged sound marks,
		// which IE will treat as "ss"
		$value = preg_replace(
			'/s(?:
				\xE3\x80\xB1 | # U+3031
				\xE3\x82\x9D | # U+309D
				\xE3\x83\xBC | # U+30FC
				\xE3\x83\xBD | # U+30FD
				\xEF\xB9\xBC | # U+FE7C
				\xEF\xB9\xBD | # U+FE7D
				\xEF\xBD\xB0   # U+FF70
			)/ix',
			'ss',
			$value
		);

		return $value;
	}


	/**
	 * Normalize a style attribute value and replace it with a CSS comment
	 * if it contains control characters or a blacklisted construct.
	 *
	 * @param string $value
	 * @return string Normalized value, or a "/* ... *" . "/" marker comment
	 */
	static function checkCss( $value ) {
		$value = self::normalizeCss( $value );

		// Reject problematic keywords and control characters
		if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
			return '/* invalid control char */';
		} elseif ( preg_match(
			'! expression
				| filter\s*:
				| accelerator\s*:
				| -o-link\s*:
				| -o-link-source\s*:
				| -o-replace\s*:
				| url\s*\(
				| image\s*\(
				| image-set\s*\(
			!ix', $value ) ) {
			return '/* insecure input */';
		}
		return $value;
	}

	/**
	 * preg_replace_callback() handler for the CSS escape regex built in
	 * normalizeCss().
	 *
	 * @param array $matches
	 * @return string
	 */
	static function cssDecodeCallback( $matches ) {
		if ( $matches[1] !== '' ) {
			// Line continuation
			return '';
		} elseif ( $matches[2] !== '' ) {
			$char = codepointToUtf8( hexdec( $matches[2] ) );
		} elseif ( $matches[3] !== '' ) {
			$char = $matches[3];
		} else {
			$char = '\\';
		}
		if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $char ) ) . ' ';
		} else {
			// Decode unnecessary escape
			return $char;
		}
	}

	/**
	 * Decode a tag's attribute string, drop everything not allowed for
	 * $element and re-encode the survivors.
	 *
	 * @param string $text Raw attribute string
	 * @param string $element
	 * @return string Empty, or the attributes with a leading space
	 */
	static function fixTagAttributes( $text, $element ) {
		if ( trim( $text ) == '' ) {
			return '';
		}

		$decoded = Sanitizer::decodeTagAttributes( $text );
		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );

		return Sanitizer::safeEncodeTagAttributes( $stripped );
	}

	/**
	 * Encode an attribute value for HTML output.
	 *
	 * @param string $text
	 * @return string
	 */
	static function encodeAttribute( $text ) {
		$encValue = htmlspecialchars( $text, ENT_QUOTES );

		// Whitespace is normalized during attribute decoding,
		// so if we've been passed non-spaces we must encode them
		// ahead of time or they won't be preserved.
		$encValue = strtr( $encValue, array(
			"\n" => '&#10;',
			"\r" => '&#13;',
			"\t" => '&#9;',
		) );

		return $encValue;
	}

	/**
	 * Encode an attribute value for HTML output and additionally escape
	 * characters and words that later wikitext parsing stages act on.
	 *
	 * @param string $text
	 * @return string
	 */
	static function safeEncodeAttribute( $text ) {
		$encValue = Sanitizer::encodeAttribute( $text );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$encValue = strtr( $encValue, array(
			'<'    => '&lt;',   // This should never happen,
			'>'    => '&gt;',   // we've received invalid input
			'"'    => '&quot;', // which should have been escaped.
			'{'    => '&#123;',
			'['    => '&#91;',
			"''"   => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC'  => '&#82;FC',
			'PMID' => '&#80;MID',
			'|'    => '&#124;',
			'__'   => '&#95;_',
		) );

		# Stupid hack
		$encValue = preg_replace_callback(
			'/((?i)' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$encValue );
		return $encValue;
	}

	/**
	 * Turn arbitrary text into a value usable as an HTML id.
	 *
	 * @param string $id
	 * @param string|array $options 'noninitial' skips the leading-letter
	 *   fix-up; 'legacy' forces HTML4-style escaping even when
	 *   $wgExperimentalHtmlIds is set
	 * @return string
	 */
	static function escapeId( $id, $options = array() ) {
		global $wgExperimentalHtmlIds;
		$options = (array)$options;

		$id = Sanitizer::decodeCharReferences( $id );

		if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
			$id = trim( $id, '_' );
			if ( $id === '' ) {
				# Must have been all whitespace to start with.
				return '_';
			} else {
				return $id;
			}
		}

		# HTML4-style escaping
		static $replace = array(
			'%3A' => ':',
			'%' => '.'
		);

		$id = urlencode( strtr( $id, ' ', '_' ) );
		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

		if ( !preg_match( '/^[a-zA-Z]/', $id )
			&& !in_array( 'noninitial', $options ) ) {
			// Initial character must be a letter!
			$id = "x$id";
		}
		return $id;
	}

	/**
	 * Turn arbitrary text into a value usable as a CSS class name.
	 *
	 * @param string $class
	 * @return string
	 */
	static function escapeClass( $class ) {
		// Convert ugly stuff to underscores and kill underscores in ugly places
		return rtrim( preg_replace(
			array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ),
			'_',
			$class ), '_' );
	}

	/**
	 * Escape text for HTML output while letting existing character
	 * references through: they are decoded first, then everything is
	 * re-escaped.
	 *
	 * @param string $html
	 * @return string
	 */
	static function escapeHtmlAllowEntities( $html ) {
		$html = Sanitizer::decodeCharReferences( $html );
		# It seems wise to escape ' as well as ", as a matter of course.  Can't
		# hurt.
		$html = htmlspecialchars( $html, ENT_QUOTES );
		return $html;
	}

	/**
	 * preg_replace_callback() handler: escape the colon in a URL protocol
	 * so the protocol is no longer recognised as a link start.
	 *
	 * @param array $matches
	 * @return string
	 */
	private static function armorLinksCallback( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	}

	/**
	 * Parse a tag's attribute string into a name => value array. Names are
	 * lower-cased; values are whitespace-normalized, trimmed and have their
	 * character references decoded. Later duplicates overwrite earlier ones.
	 *
	 * @param string $text
	 * @return array
	 */
	public static function decodeTagAttributes( $text ) {
		if ( trim( $text ) == '' ) {
			return array();
		}

		$attribs = array();
		$pairs = array();
		if ( !preg_match_all(
			self::getAttribsRegex(),
			$text,
			$pairs,
			PREG_SET_ORDER ) ) {
			return $attribs;
		}

		foreach ( $pairs as $set ) {
			$attribute = strtolower( $set[1] );
			$value = Sanitizer::getTagAttributeCallback( $set );

			// Normalize whitespace
			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
			$value = trim( $value );

			// Decode character references
			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
		}
		return $attribs;
	}

	/**
	 * Build an attribute string from a name => value array.
	 *
	 * @param array $assoc_array
	 * @return string Empty, or ' name="value" ...' with a leading space
	 */
	public static function safeEncodeTagAttributes( $assoc_array ) {
		$attribs = array();
		foreach ( $assoc_array as $attribute => $value ) {
			$encAttribute = htmlspecialchars( $attribute );
			$encValue = Sanitizer::safeEncodeAttribute( $value );

			$attribs[] = "$encAttribute=\"$encValue\"";
		}
		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
	}

	/**
	 * Pick the attribute value out of one match set produced by the regex
	 * from getAttribsRegex().
	 *
	 * @param array $set
	 * @throws MWException When the set has a "=value" part but no value group
	 * @return string
	 */
	private static function getTagAttributeCallback( $set ) {
		if ( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			return $set[6];
		} elseif ( isset( $set[5] ) ) {
			# No quotes.
			return $set[5];
		} elseif ( isset( $set[4] ) ) {
			# Single-quoted
			return $set[4];
		} elseif ( isset( $set[3] ) ) {
			# Double-quoted
			return $set[3];
		} elseif ( !isset( $set[2] ) ) {
			# In XHTML, attributes must have a value.
			# For 'reduced' form, return explicitly the attribute name here.
			return $set[1];
		} else {
			throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
		}
	}

	/**
	 * Normalize character references and whitespace in an attribute value
	 * and escape double quotes.
	 *
	 * @param string $text
	 * @return string
	 */
	private static function normalizeAttributeValue( $text ) {
		return str_replace( '"', '&quot;',
			self::normalizeWhitespace(
				Sanitizer::normalizeCharReferences( $text ) ) );
	}

	/**
	 * Replace each CRLF pair or single space/CR/LF/tab with one space.
	 *
	 * @param string $text
	 * @return string
	 */
	private static function normalizeWhitespace( $text ) {
		return preg_replace(
			'/\r\n|[\x20\x0d\x0a\x09]/',
			' ',
			$text );
	}

	/**
	 * Collapse runs of spaces and underscores to one space and trim.
	 *
	 * @param string $section
	 * @return string
	 */
	static function normalizeSectionNameWhitespace( $section ) {
		return trim( preg_replace( '/[ _]+/', ' ', $section ) );
	}

	/**
	 * Rewrite every character reference in $text into a normalized form;
	 * invalid references and bare ampersands are escaped.
	 *
	 * @param string $text
	 * @return string
	 */
	static function normalizeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
			$text );
	}

	/**
	 * preg_replace_callback() handler for normalizeCharReferences().
	 *
	 * @param array $matches
	 * @return string
	 */
	static function normalizeCharReferencesCallback( $matches ) {
		$ret = null;
		if ( $matches[1] != '' ) {
			$ret = Sanitizer::normalizeEntity( $matches[1] );
		} elseif ( $matches[2] != '' ) {
			$ret = Sanitizer::decCharReference( $matches[2] );
		} elseif ( $matches[3] != '' ) {
			$ret = Sanitizer::hexCharReference( $matches[3] );
		}
		if ( is_null( $ret ) ) {
			# Bare ampersand or invalid code point: escape the raw text
			return htmlspecialchars( $matches[0] );
		} else {
			return $ret;
		}
	}

	/**
	 * Normalize one named entity: aliases map to their canonical name,
	 * lt/gt/amp/quot stay named, other known names become decimal
	 * references, and unknown names get their ampersand escaped.
	 *
	 * @param string $name Entity name without '&' and ';'
	 * @return string
	 */
	static function normalizeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			return '&' . self::$htmlEntityAliases[$name] . ';';
		} elseif ( in_array( $name,
			array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
			return "&$name;";
		} elseif ( isset( self::$htmlEntities[$name] ) ) {
			return '&#' . self::$htmlEntities[$name] . ';';
		} else {
			return "&amp;$name;";
		}
	}

	/**
	 * @param string $codepoint Decimal digits
	 * @return string|null Canonical decimal reference, or null if the code
	 *   point fails validateCodepoint()
	 */
	static function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		if ( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#%d;', $point );
		} else {
			return null;
		}
	}

	/**
	 * @param string $codepoint Hexadecimal digits
	 * @return string|null Canonical lower-case hex reference, or null if
	 *   the code point fails validateCodepoint()
	 */
	static function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		if ( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#x%x;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Whether a code point is tab, LF, CR, or lies in one of the ranges
	 * 0x20-0xd7ff, 0xe000-0xfffd, 0x10000-0x10ffff.
	 *
	 * @param int $codepoint
	 * @return bool
	 */
	private static function validateCodepoint( $codepoint ) {
		return $codepoint == 0x09
			|| $codepoint == 0x0a
			|| $codepoint == 0x0d
			|| ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
			|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
			|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
	}

	/**
	 * Replace every character reference in $text with the character it
	 * denotes, as decided by decodeCharReferencesCallback().
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text );
	}

	/**
	 * Like decodeCharReferences(), but passes the result through
	 * $wgContLang->normalize() when at least one replacement was made.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferencesAndNormalize( $text ) {
		global $wgContLang;
		$text = preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text, /* limit */ -1, $count );

		if ( $count ) {
			return $wgContLang->normalize( $text );
		} else {
			return $text;
		}
	}

	/**
	 * preg_replace_callback() handler for the decodeCharReferences*()
	 * methods.
	 *
	 * @param array $matches
	 * @return string
	 */
	static function decodeCharReferencesCallback( $matches ) {
		if ( $matches[1] != '' ) {
			return Sanitizer::decodeEntity( $matches[1] );
		} elseif ( $matches[2] != '' ) {
			return Sanitizer::decodeChar( intval( $matches[2] ) );
		} elseif ( $matches[3] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[3] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	static function decodeChar( $codepoint
) { 01483 if ( Sanitizer::validateCodepoint( $codepoint ) ) { 01484 return codepointToUtf8( $codepoint ); 01485 } else { 01486 return UTF8_REPLACEMENT; 01487 } 01488 } 01489 01498 static function decodeEntity( $name ) { 01499 if ( isset( self::$htmlEntityAliases[$name] ) ) { 01500 $name = self::$htmlEntityAliases[$name]; 01501 } 01502 if ( isset( self::$htmlEntities[$name] ) ) { 01503 return codepointToUtf8( self::$htmlEntities[$name] ); 01504 } else { 01505 return "&$name;"; 01506 } 01507 } 01508 01515 static function attributeWhitelist( $element ) { 01516 $list = Sanitizer::setupAttributeWhitelist(); 01517 return isset( $list[$element] ) 01518 ? $list[$element] 01519 : array(); 01520 } 01521 01527 static function setupAttributeWhitelist() { 01528 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes; 01529 static $whitelist, $staticInitialised; 01530 01531 $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) ); 01532 01533 if ( $whitelist !== null && $staticInitialised == $globalContext ) { 01534 return $whitelist; 01535 } 01536 01537 $common = array( 01538 # HTML 01539 'id', 01540 'class', 01541 'style', 01542 'lang', 01543 'dir', 01544 'title', 01545 01546 # WAI-ARIA 01547 'role', 01548 ); 01549 01550 if ( $wgAllowRdfaAttributes ) { 01551 # RDFa attributes as specified in section 9 of 01552 # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 01553 $common = array_merge( $common, array( 01554 'about', 'property', 'resource', 'datatype', 'typeof', 01555 ) ); 01556 } 01557 01558 if ( $wgAllowMicrodataAttributes ) { 01559 # add HTML5 microdata tags as specified by 01560 # http://www.whatwg.org/html/microdata.html#the-microdata-model 01561 $common = array_merge( $common, array( 01562 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' 01563 ) ); 01564 } 01565 01566 $block = array_merge( $common, array( 'align' ) ); 01567 $tablealign = array( 'align', 'valign' ); 01568 $tablecell = array( 01569 'abbr', 01570 
'axis', 01571 'headers', 01572 'scope', 01573 'rowspan', 01574 'colspan', 01575 'nowrap', # deprecated 01576 'width', # deprecated 01577 'height', # deprecated 01578 'bgcolor', # deprecated 01579 ); 01580 01581 # Numbers refer to sections in HTML 4.01 standard describing the element. 01582 # See: http://www.w3.org/TR/html4/ 01583 $whitelist = array( 01584 # 7.5.4 01585 'div' => $block, 01586 'center' => $common, # deprecated 01587 'span' => $common, 01588 01589 # 7.5.5 01590 'h1' => $block, 01591 'h2' => $block, 01592 'h3' => $block, 01593 'h4' => $block, 01594 'h5' => $block, 01595 'h6' => $block, 01596 01597 # 7.5.6 01598 # address 01599 01600 # 8.2.4 01601 'bdo' => $common, 01602 01603 # 9.2.1 01604 'em' => $common, 01605 'strong' => $common, 01606 'cite' => $common, 01607 'dfn' => $common, 01608 'code' => $common, 01609 'samp' => $common, 01610 'kbd' => $common, 01611 'var' => $common, 01612 'abbr' => $common, 01613 # acronym 01614 01615 # 9.2.2 01616 'blockquote' => array_merge( $common, array( 'cite' ) ), 01617 'q' => array_merge( $common, array( 'cite' ) ), 01618 01619 # 9.2.3 01620 'sub' => $common, 01621 'sup' => $common, 01622 01623 # 9.3.1 01624 'p' => $block, 01625 01626 # 9.3.2 01627 'br' => array_merge( $common, array( 'clear' ) ), 01628 01629 # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element 01630 'wbr' => $common, 01631 01632 # 9.3.4 01633 'pre' => array_merge( $common, array( 'width' ) ), 01634 01635 # 9.4 01636 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ), 01637 'del' => array_merge( $common, array( 'cite', 'datetime' ) ), 01638 01639 # 10.2 01640 'ul' => array_merge( $common, array( 'type' ) ), 01641 'ol' => array_merge( $common, array( 'type', 'start' ) ), 01642 'li' => array_merge( $common, array( 'type', 'value' ) ), 01643 01644 # 10.3 01645 'dl' => $common, 01646 'dd' => $common, 01647 'dt' => $common, 01648 01649 # 11.2.1 01650 'table' => array_merge( $common, 01651 array( 'summary', 'width', 'border', 
'frame', 01652 'rules', 'cellspacing', 'cellpadding', 01653 'align', 'bgcolor', 01654 ) ), 01655 01656 # 11.2.2 01657 'caption' => $block, 01658 01659 # 11.2.3 01660 'thead' => $common, 01661 'tfoot' => $common, 01662 'tbody' => $common, 01663 01664 # 11.2.4 01665 'colgroup' => array_merge( $common, array( 'span' ) ), 01666 'col' => array_merge( $common, array( 'span' ) ), 01667 01668 # 11.2.5 01669 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ), 01670 01671 # 11.2.6 01672 'td' => array_merge( $common, $tablecell, $tablealign ), 01673 'th' => array_merge( $common, $tablecell, $tablealign ), 01674 01675 # 12.2 01676 # NOTE: <a> is not allowed directly, but the attrib 01677 # whitelist is used from the Parser object 01678 'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa 01679 01680 # 13.2 01681 # Not usually allowed, but may be used for extension-style hooks 01682 # such as <math> when it is rasterized, or if $wgAllowImageTag is 01683 # true 01684 'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ), 01685 01686 # 15.2.1 01687 'tt' => $common, 01688 'b' => $common, 01689 'i' => $common, 01690 'big' => $common, 01691 'small' => $common, 01692 'strike' => $common, 01693 's' => $common, 01694 'u' => $common, 01695 01696 # 15.2.2 01697 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ), 01698 # basefont 01699 01700 # 15.3 01701 'hr' => array_merge( $common, array( 'width' ) ), 01702 01703 # HTML Ruby annotation text module, simple ruby only. 
01704 # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element 01705 'ruby' => $common, 01706 # rbc 01707 'rb' => $common, 01708 'rp' => $common, 01709 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), 01710 'rtc' => $common, 01711 01712 # MathML root element, where used for extensions 01713 # 'title' may not be 100% valid here; it's XHTML 01714 # http://www.w3.org/TR/REC-MathML/ 01715 'math' => array( 'class', 'style', 'id', 'title' ), 01716 01717 # HTML 5 section 4.6 01718 'bdi' => $common, 01719 01720 # HTML5 elements, defined by: 01721 # http://www.whatwg.org/html/ 01722 'data' => array_merge( $common, array( 'value' ) ), 01723 'time' => array_merge( $common, array( 'datetime' ) ), 01724 'mark' => $common, 01725 01726 // meta and link are only permitted by removeHTMLtags when Microdata 01727 // is enabled so we don't bother adding a conditional to hide these 01728 // Also meta and link are only valid in WikiText as Microdata elements 01729 // (ie: validateTag rejects tags missing the attributes needed for Microdata) 01730 // So we don't bother including $common attributes that have no purpose. 
01731 'meta' => array( 'itemprop', 'content' ), 01732 'link' => array( 'itemprop', 'href' ), 01733 ); 01734 01735 $staticInitialised = $globalContext; 01736 01737 return $whitelist; 01738 } 01739 01750 static function stripAllTags( $text ) { 01751 # Actual <tags> 01752 $text = StringUtils::delimiterReplace( '<', '>', '', $text ); 01753 01754 # Normalize &entities and whitespace 01755 $text = self::decodeCharReferences( $text ); 01756 $text = self::normalizeWhitespace( $text ); 01757 01758 return $text; 01759 } 01760 01770 static function hackDocType() { 01771 $out = "<!DOCTYPE html [\n"; 01772 foreach ( self::$htmlEntities as $entity => $codepoint ) { 01773 $out .= "<!ENTITY $entity \"&#$codepoint;\">"; 01774 } 01775 $out .= "]>\n"; 01776 return $out; 01777 } 01778 01783 static function cleanUrl( $url ) { 01784 # Normalize any HTML entities in input. They will be 01785 # re-escaped by makeExternalLink(). 01786 $url = Sanitizer::decodeCharReferences( $url ); 01787 01788 # Escape any control characters introduced by the above step 01789 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 01790 array( __CLASS__, 'cleanUrlCallback' ), $url ); 01791 01792 # Validate hostname portion 01793 $matches = array(); 01794 if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { 01795 list( /* $whole */, $protocol, $host, $rest ) = $matches; 01796 01797 // Characters that will be ignored in IDNs. 01798 // http://tools.ietf.org/html/3454#section-3.1 01799 // Strip them before further processing so blacklists and such work. 
01800 $strip = "/ 01801 \\s| # general whitespace 01802 \xc2\xad| # 00ad SOFT HYPHEN 01803 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 01804 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 01805 \xe2\x81\xa0| # 2060 WORD JOINER 01806 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 01807 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 01808 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 01809 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 01810 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 01811 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 01812 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 01813 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 01814 /xuD"; 01815 01816 $host = preg_replace( $strip, '', $host ); 01817 01818 // @todo FIXME: Validate hostnames here 01819 01820 return $protocol . $host . $rest; 01821 } else { 01822 return $url; 01823 } 01824 } 01825 01830 static function cleanUrlCallback( $matches ) { 01831 return urlencode( $matches[0] ); 01832 } 01833 01862 public static function validateEmail( $addr ) { 01863 $result = null; 01864 if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) { 01865 return $result; 01866 } 01867 01868 // Please note strings below are enclosed in brackets [], this make the 01869 // hyphen "-" a range indicator. Hence it is double backslashed below. 01870 // See bug 26948 01871 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~"; 01872 $rfc1034_ldh_str = "a-z0-9\\-"; 01873 01874 $html5_email_regexp = "/ 01875 ^ # start of string 01876 [$rfc5322_atext\\.]+ # user part which is liberal :p 01877 @ # 'apostrophe' 01878 [$rfc1034_ldh_str]+ # First domain part 01879 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot 01880 $ # End of string 01881 /ix"; // case Insensitive, eXtended 01882 01883 return (bool)preg_match( $html5_email_regexp, $addr ); 01884 } 01885 }