MediaWiki
REL1_21
|
<?php
/**
 * HTML sanitizer: static helpers for cleaning tags, attributes, inline CSS
 * and character references.
 */
class Sanitizer {
	/**
	 * Extended-mode (/x) regex that matches one character reference or a bare
	 * ampersand. Capture groups:
	 *   1 - name of a named entity (&name;)
	 *   2 - digits of a decimal reference (&#123;)
	 *   3 - digits of a hexadecimal reference (&#x7b; or &#X7b;)
	 *   4 - a lone ampersand
	 */
	const CHAR_REFS_REGEX =
		'/&([A-Za-z0-9\x80-\xff]+);
		 |&\#([0-9]+);
		 |&\#[xX]([0-9A-Fa-f]+);
		 |(&)/x';

	/**
	 * Case-insensitive pattern matching "javascript" or "vbscript" when it
	 * appears at the start of the value, after whitespace, or after a closing
	 * comment marker, and is followed by a non-word character or end of string.
	 */
	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

	/** Pattern for attribute names of the form xmlns:<prefix>. */
	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

	/**
	 * Named character entities mapped to their Unicode code points.
	 * Entries are listed alphabetically, one initial letter per group.
	 */
	static $htmlEntities = array(
		// A
		'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
		'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
		'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
		'amp' => 38, 'and' => 8743, 'ang' => 8736,
		'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
		'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195,
		'atilde' => 227, 'Auml' => 196, 'auml' => 228,
		// B
		'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166,
		'bull' => 8226,
		// C
		'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
		'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
		'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
		'cup' => 8746, 'curren' => 164,
		// D
		'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659,
		'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830,
		'divide' => 247,
		// E
		'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234,
		'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
		'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801,
		'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240,
		'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
		// F
		'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188,
		'frac34' => 190, 'frasl' => 8260,
		// G
		'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
		// H
		'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
		// I
		'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
		'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
		'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
		'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
		// K
		'Kappa' => 922, 'kappa' => 954,
		// L
		'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
		'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220,
		'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674,
		'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60,
		// M
		'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183,
		'minus' => 8722, 'Mu' => 924, 'mu' => 956,
		// N
		'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
		'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
		'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
		// O
		'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
		'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
		'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
		'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
		'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
		'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
		// P
		'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
		'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
		'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
		'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936,
		'psi' => 968,
		// Q
		'quot' => 34,
		// R
		'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
		'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
		'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
		'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
		// S
		'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901,
		'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963,
		'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834,
		'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185,
		'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
		// T
		'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920,
		'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222,
		'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482,
		// U
		'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657,
		'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
		'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965,
		'Uuml' => 220, 'uuml' => 252,
		// W, X, Y, Z
		'weierp' => 8472,
		'Xi' => 926, 'xi' => 958,
		'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
		'yuml' => 255,
		'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204
	);
# ---- Sanitizer members: entity aliases, attribute parsing, tag / CSS / character-reference handling ----

	/**
	 * Localized aliases for named entities, mapped to the canonical entity
	 * name. Consulted by normalizeEntity().
	 */
	static $htmlEntityAliases = array(
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	);

	/**
	 * Cache for the attribute-matching regex built by getAttribsRegex().
	 */
	static $attribsRegex;

	/**
	 * Build (once) and return the regex used to pick attributes out of the
	 * text between a tag name and its closing '>'.
	 *
	 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
	 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
	 * 6 = unquoted "#hex" colour value.
	 *
	 * @return string PCRE pattern (extended mode, so the embedded newlines
	 *   and '#' comments are part of the pattern source)
	 */
	static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$attribFirst = '[:A-Z_a-z0-9]';
			$attrib = '[:A-Z_a-z-.0-9]';
			$space = '[\x09\x0a\x0d\x20]';
			self::$attribsRegex =
				"/(?:^|$space)({$attribFirst}{$attrib}*)
				  ($space*=$space*
					(?:
					 # The attribute value: quoted or alone
					  \"([^<\"]*)\"
					 | '([^<']*)'
					 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
					 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
					                     # colors are specified like this.
					                     # We'll be normalizing it.
					)
				)?(?=$space|\$)/sx";
		}
		return self::$attribsRegex;
	}

	/**
	 * Clean up HTML in $text: drop comments, keep only whitelisted elements
	 * and attributes, and escape everything else so it shows as plain text.
	 * When $wgUseTidy is off, tag nesting is also tracked and unclosed tags
	 * are closed at the end.
	 *
	 * @param string $text
	 * @param callable|null $processCallback Called as ( &$params, $args ) on
	 *   the raw attribute text of each accepted opening tag before validation
	 * @param array $args Passed through to $processCallback
	 * @param array $extratags Additional element names to allow (treated as
	 *   paired tags)
	 * @param array $removetags Element names to disallow
	 * @return string
	 */
	static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
		global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;

		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

		wfProfileIn( __METHOD__ );

		// Base our staticInitialised variable off of the global config state so that if the globals
		// are changed (like in the screwed up test system) we will re-initialise the settings.
		$globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
		if ( !$staticInitialised || $staticInitialised != $globalContext ) {

			$htmlpairsStatic = array( # Tags that must be closed
				'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
				'kbd', 'samp'
			);
			if ( $wgHtml5 ) {
				$htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
			}
			$htmlsingle = array(
				'br', 'hr', 'li', 'dt', 'dd'
			);
			$htmlsingleonly = array( # Elements that cannot have close tags
				'br', 'hr'
			);
			if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
				$htmlsingle[] = $htmlsingleonly[] = 'meta';
				$htmlsingle[] = $htmlsingleonly[] = 'link';
			}
			$htmlnest = array( # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
			);
			$tabletags = array( # Can only appear inside table, we will close them
				'td', 'th', 'tr',
			);
			$htmllist = array( # Tags used by list
				'ul','ol',
			);
			$listtags = array( # Tags that can appear in a list
				'li',
			);

			if ( $wgAllowImageTag ) {
				$htmlsingle[] = 'img';
				$htmlsingleonly[] = 'img';
			}

			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

			# Convert them all to hashtables for faster lookup
			$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
				'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
			foreach ( $vars as $var ) {
				$$var = array_flip( $$var );
			}
			$staticInitialised = $globalContext;
		}
		# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
		$extratags = array_flip( $extratags );
		$removetags = array_flip( $removetags );
		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

		# Remove HTML comments
		$text = Sanitizer::removeHTMLcomments( $text );
		$bits = explode( '<', $text );
		# Text before the first '<' cannot contain a tag; just escape stray '>'
		$text = str_replace( '>', '&gt;', array_shift( $bits ) );
		if ( !$wgUseTidy ) {
			$tagstack = $tablestack = array();
			foreach ( $bits as $x ) {
				$regs = array();
				# $slash: Does the current element start with a '/'?
				# $t: Current element name
				# $params: String between element name and >
				# $brace: Ending '>' or '/>'
				# $rest: Everything until the next element of $bits
				if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				} else {
					$slash = $t = $params = $brace = $rest = null;
				}

				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					# Check our stack
					if ( $slash && isset( $htmlsingleonly[$t] ) ) {
						$badtag = true;
					} elseif ( $slash ) {
						# Closing a tag... is it the one we just opened?
						$ot = @array_pop( $tagstack );
						if ( $ot != $t ) {
							if ( isset( $htmlsingleallowed[$ot] ) ) {
								# Pop all elements with an optional close tag
								# and see if we find a match below them
								$optstack = array();
								array_push( $optstack, $ot );
								wfSuppressWarnings();
								$ot = array_pop( $tagstack );
								wfRestoreWarnings();
								while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
									array_push( $optstack, $ot );
									wfSuppressWarnings();
									$ot = array_pop( $tagstack );
									wfRestoreWarnings();
								}
								if ( $t != $ot ) {
									# No match. Push the optional elements back again
									$badtag = true;
									wfSuppressWarnings();
									$ot = array_pop( $optstack );
									wfRestoreWarnings();
									while ( $ot ) {
										array_push( $tagstack, $ot );
										wfSuppressWarnings();
										$ot = array_pop( $optstack );
										wfRestoreWarnings();
									}
								}
							} else {
								@array_push( $tagstack, $ot );
								# <li> can be nested in <ul> or <ol>, skip those cases:
								if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
									$badtag = true;
								}
							}
						} else {
							if ( $t == 'table' ) {
								$tagstack = array_pop( $tablestack );
							}
						}
						$newparams = '';
					} else {
						# Keep track for later
						if ( isset( $tabletags[$t] ) &&
							!in_array( 'table', $tagstack ) ) {
							$badtag = true;
						} elseif ( in_array( $t, $tagstack ) &&
							!isset( $htmlnest [$t ] ) ) {
							$badtag = true;
						# Is it a self closed htmlpair ? (bug 5487)
						} elseif ( $brace == '/>' &&
							isset( $htmlpairs[$t] ) ) {
							$badtag = true;
						} elseif ( isset( $htmlsingleonly[$t] ) ) {
							# Hack to force empty tag for unclosable elements
							$brace = '/>';
						} elseif ( isset( $htmlsingle[$t] ) ) {
							# Hack to not close $htmlsingle tags
							$brace = null;
							# Still need to push this optionally-closed tag to
							# the tag stack so that we can match end tags
							# instead of marking them as bad.
							array_push( $tagstack, $t );
						} elseif ( isset( $tabletags[$t] )
							&& in_array( $t, $tagstack ) ) {
							// New table tag but forgot to close the previous one
							$text .= "</$t>";
						} else {
							if ( $t == 'table' ) {
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}

						# Replace any variables or template parameters with
						# plaintext results.
						if( is_callable( $processCallback ) ) {
							call_user_func_array( $processCallback, array( &$params, $args ) );
						}

						if ( !Sanitizer::validateTag( $params, $t ) ) {
							$badtag = true;
						}

						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
						$text .= "<$slash$t$newparams$close>$rest";
						continue;
					}
				}
				# Rejected or unrecognised tag: emit it fully escaped so it
				# is displayed as text instead of being parsed as markup.
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
			# Close off any remaining tags
			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
				$text .= "</$t>\n";
				if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
			}
		} else {
			# this might be possible using tidy itself
			foreach ( $bits as $x ) {
				preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
				$x, $regs );
				@list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
				# Rejected or unrecognised tag: escape it (see above)
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Remove HTML comments ('<!--' ... '-->') from $text.
	 *
	 * When a comment (plus any surrounding spaces) sits between two newlines,
	 * the comment, the spaces and one of the newlines are removed together.
	 * Processing stops at the first unterminated comment, which is left in
	 * place.
	 *
	 * @param string $text
	 * @return string
	 */
	static function removeHTMLcomments( $text ) {
		wfProfileIn( __METHOD__ );
		while ( ($start = strpos( $text, '<!--' ) ) !== false ) {
			$end = strpos( $text, '-->', $start + 4 );
			if ( $end === false ) {
				# Unterminated comment; bail out
				break;
			}

			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline
			$spaceStart = max( $start - 1, 0 );
			$spaceLen = $end - $spaceStart;
			while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
				$spaceStart--;
				$spaceLen++;
			}
			while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
				$spaceLen++;
			}
			if ( substr( $text, $spaceStart, 1 ) === "\n" and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
			}
			else {
				# Remove just the comment.
				$text = substr_replace( $text, '', $start, $end - $start );
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Check element-specific requirements on a tag's attributes.
	 *
	 * Only <meta> and <link> are restricted: both need an itemprop attribute,
	 * <meta> additionally needs content, and <link> additionally needs href.
	 *
	 * @param string $params Raw attribute text of the tag
	 * @param string $element Element name
	 * @return bool False if the tag must be rejected
	 */
	static function validateTag( $params, $element ) {
		$params = Sanitizer::decodeTagAttributes( $params );

		if ( $element == 'meta' || $element == 'link' ) {
			if ( !isset( $params['itemprop'] ) ) {
				// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
				return false;
			}
			if ( $element == 'meta' && !isset( $params['content'] ) ) {
				// <meta> must have a content="" for the itemprop
				return false;
			}
			if ( $element == 'link' && !isset( $params['href'] ) ) {
				// <link> must have an associated href=""
				return false;
			}
		}

		return true;
	}

	/**
	 * Filter decoded attributes against the whitelist for $element.
	 *
	 * @param array $attribs Attribute name => value
	 * @param string $element Element name
	 * @return array Filtered attribute name => value
	 */
	static function validateTagAttributes( $attribs, $element ) {
		return Sanitizer::validateAttributes( $attribs,
			Sanitizer::attributeWhitelist( $element ) );
	}

	/**
	 * Filter decoded attributes against an explicit whitelist and sanitize
	 * the values of the ones that are kept.
	 *
	 * - xmlns:* declarations pass when $wgAllowRdfaAttributes is set and the
	 *   value does not match EVIL_URI_PATTERN.
	 * - data-* attributes pass in HTML5 mode even if not whitelisted.
	 * - style values go through checkCss(), id values through escapeId().
	 * - role is kept only when it equals "presentation".
	 * - RDFa / microdata attributes are dropped if they match
	 *   EVIL_URI_PATTERN.
	 * - href / src are dropped unless they start with an allowed protocol.
	 *
	 * @param array $attribs Attribute name => value
	 * @param array $whitelist List of allowed attribute names
	 * @return array Filtered attribute name => value
	 */
	static function validateAttributes( $attribs, $whitelist ) {
		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;

		$whitelist = array_flip( $whitelist );
		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

		$out = array();
		foreach( $attribs as $attribute => $value ) {
			#allow XML namespace declaration if RDFa is enabled
			if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					$out[$attribute] = $value;
				}

				continue;
			}

			# Allow any attribute beginning with "data-", if in HTML5 mode
			if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
				continue;
			}

			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			if( $attribute == 'style' ) {
				$value = Sanitizer::checkCss( $value );
			}

			if ( $attribute === 'id' ) {
				$value = Sanitizer::escapeId( $value, 'noninitial' );
			}

			# WAI-ARIA
			# http://www.w3.org/TR/wai-aria/
			# http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
			# For now we only support role="presentation" until we work out what roles should be
			# usable by content and we ensure that our code explicitly rejects patterns that
			# violate HTML5's ARIA restrictions.
			if ( $attribute === 'role' && $value !== 'presentation' ) {
				continue;
			}

			//RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
			if ( $attribute === 'rel' || $attribute === 'rev' ||
				$attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
				$attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
				$attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
				$attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata

				//Paranoia. Allow "simple" values but suppress javascript
				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					continue;
				}
			}

			# NOTE: even though elements using href/src are not allowed directly, supply
			#       validation code that can be used by tag hook handlers, etc
			if ( $attribute === 'href' || $attribute === 'src' ) {
				if ( !preg_match( $hrefExp, $value ) ) {
					continue; //drop any href or src attributes not using an allowed protocol.
					          //NOTE: this also drops all relative URLs
				}
			}

			// If this attribute was previously set, override it.
			// Output should only have one attribute of each name.
			$out[$attribute] = $value;
		}

		if ( $wgAllowMicrodataAttributes ) {
			# itemtype, itemid, itemref don't make sense without itemscope
			if ( !array_key_exists( 'itemscope', $out ) ) {
				unset( $out['itemtype'] );
				unset( $out['itemid'] );
				unset( $out['itemref'] );
			}
			# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
		}
		return $out;
	}

	/**
	 * Merge two attribute arrays; values in $b override those in $a, except
	 * that differing string 'class' values are combined into one
	 * space-separated list without duplicates.
	 *
	 * @param array $a
	 * @param array $b
	 * @return array
	 */
	static function mergeAttributes( $a, $b ) {
		$out = array_merge( $a, $b );
		if( isset( $a['class'] ) && isset( $b['class'] )
			&& is_string( $a['class'] ) && is_string( $b['class'] )
			&& $a['class'] !== $b['class'] ) {
			$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
				-1, PREG_SPLIT_NO_EMPTY );
			$out['class'] = implode( ' ', array_unique( $classes ) );
		}
		return $out;
	}

	/**
	 * Sanitize the value of a style attribute.
	 *
	 * Character references and CSS escapes are decoded, look-alike characters
	 * are folded to ASCII, and comments are removed. If the result contains
	 * control characters or one of the rejected keywords / function calls,
	 * a placeholder CSS comment is returned instead of the value.
	 *
	 * @param string $value Raw style attribute value
	 * @return string Normalized CSS, or '/* invalid control char *' + '/' or
	 *   '/* insecure input *' + '/' when the value is rejected
	 */
	static function checkCss( $value ) {
		// Decode character references like &#123;
		$value = Sanitizer::decodeCharReferences( $value );

		// Decode escape sequences and line continuation
		// See the grammar in the CSS 2 spec, appendix D.
		// This has to be done AFTER decoding character references.
		// This means it isn't possible for this function to return
		// unsanitized escape sequences. It is possible to manufacture
		// input that contains character references that decode to
		// escape sequences that decode to character references, but
		// it's OK for the return value to contain character references
		// because the caller is supposed to escape those anyway.
		static $decodeRegex;
		if ( !$decodeRegex ) {
			$space = '[\\x20\\t\\r\\n\\f]';
			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
			$backslash = '\\\\';
			$decodeRegex = "/ $backslash
				(?:
					($nl) |  # 1. Line continuation
					([0-9A-Fa-f]{1,6})$space? |  # 2. character number
					(.) | # 3. backslash cancelling special meaning
					() | # 4. backslash at end of string
				)/xu";
		}
		$value = preg_replace_callback( $decodeRegex,
			array( __CLASS__, 'cssDecodeCallback' ), $value );

		// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
		// The class is written with \x{...} escapes so it survives any
		// re-encoding of this file.
		$value = preg_replace_callback(
			'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
			function ( $matches ) {
				$cp = utf8ToCodepoint( $matches[0] );
				if ( $cp === false ) {
					return '';
				}
				return chr( $cp - 65248 ); // ASCII range \x21-\x7A
			},
			$value
		);

		// Convert more characters IE6 might treat as ascii
		// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
		$value = str_replace(
			array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
			array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
			$value
		);

		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}

		// S followed by repeat, iteration, or prolonged sound marks,
		// which IE will treat as "ss"
		$value = preg_replace(
			'/s(?:
				\xE3\x80\xB1 | # U+3031
				\xE3\x82\x9D | # U+309D
				\xE3\x83\xBC | # U+30FC
				\xE3\x83\xBD | # U+30FD
				\xEF\xB9\xBC | # U+FE7C
				\xEF\xB9\xBD | # U+FE7D
				\xEF\xBD\xB0   # U+FF70
			)/ix',
			'ss',
			$value
		);

		// Reject problematic keywords and control characters
		if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
			return '/* invalid control char */';
		} elseif ( preg_match(
			'! expression
				| filter\s*:
				| accelerator\s*:
				| -o-link\s*:
				| -o-link-source\s*:
				| -o-replace\s*:
				| url\s*\(
				| image\s*\(
				| image-set\s*\(
			!ix', $value ) ) {
			return '/* insecure input */';
		}
		return $value;
	}

	/**
	 * preg_replace_callback() handler for the escape regex in checkCss().
	 *
	 * @param array $matches 1 = line continuation, 2 = hex code point,
	 *   3 = single escaped character; all empty for a trailing backslash
	 * @return string Decoded character, or a normalized hex escape for
	 *   newline, quotes and backslash
	 */
	static function cssDecodeCallback( $matches ) {
		if ( $matches[1] !== '' ) {
			// Line continuation
			return '';
		} elseif ( $matches[2] !== '' ) {
			$char = codepointToUtf8( hexdec( $matches[2] ) );
		} elseif ( $matches[3] !== '' ) {
			$char = $matches[3];
		} else {
			$char = '\\';
		}
		if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $char ) ) . ' ';
		} else {
			// Decode unnecessary escape
			return $char;
		}
	}

	/**
	 * Turn the raw attribute text of a tag into a sanitized attribute string.
	 *
	 * Attributes are decoded, filtered through validateTagAttributes() and
	 * re-encoded as name="value" pairs.
	 *
	 * @param string $text Raw attribute text
	 * @param string $element Element name
	 * @return string Empty, or a leading space followed by the attributes
	 */
	static function fixTagAttributes( $text, $element ) {
		if( trim( $text ) == '' ) {
			return '';
		}

		$decoded = Sanitizer::decodeTagAttributes( $text );
		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );

		$attribs = array();
		foreach( $stripped as $attribute => $value ) {
			$encAttribute = htmlspecialchars( $attribute );
			$encValue = Sanitizer::safeEncodeAttribute( $value );

			$attribs[] = "$encAttribute=\"$encValue\"";
		}
		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
	}

	/**
	 * Encode a value for use inside a quoted HTML attribute.
	 *
	 * @param string $text
	 * @return string HTML-escaped text with newline, carriage return and tab
	 *   written as numeric character references
	 */
	static function encodeAttribute( $text ) {
		$encValue = htmlspecialchars( $text, ENT_QUOTES );

		// Whitespace is normalized during attribute decoding,
		// so if we've been passed non-spaces we must encode them
		// ahead of time or they won't be preserved.
		$encValue = strtr( $encValue, array(
			"\n" => '&#10;',
			"\r" => '&#13;',
			"\t" => '&#9;',
		) );

		return $encValue;
	}

	/**
	 * Like encodeAttribute(), but additionally encodes characters and
	 * keywords that later parsing passes could expand (templates, links,
	 * magic links, URL protocols).
	 *
	 * @param string $text
	 * @return string
	 */
	static function safeEncodeAttribute( $text ) {
		$encValue = Sanitizer::encodeAttribute( $text );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		# Each replacement is a character reference for (the first character
		# of) its key, so the browser still sees the original text.
		$encValue = strtr( $encValue, array(
			'<'    => '&lt;',   // This should never happen,
			'>'    => '&gt;',   // we've received invalid input
			'"'    => '&quot;', // which should have been escaped.
			'{'    => '&#123;',
			'['    => '&#91;',
			"''"   => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC'  => '&#82;FC',
			'PMID' => '&#80;MID',
			'|'    => '&#124;',
			'__'   => '&#95;_',
		) );

		# Stupid hack
		$encValue = preg_replace_callback(
			'/((?i)' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$encValue );
		return $encValue;
	}

	/**
	 * Make a string usable as an HTML id attribute.
	 *
	 * With $wgHtml5 and $wgExperimentalHtmlIds set (and no 'legacy' option),
	 * runs of whitespace, underscores, quotes, '&', '#' and '%' collapse to a
	 * single underscore and surrounding underscores are trimmed. Otherwise
	 * the id is URL-encoded with '%3A' turned back into ':' and '%' into '.',
	 * and prefixed with 'x' when it does not start with an ASCII letter.
	 *
	 * @param string $id
	 * @param string|array $options 'noninitial' skips the 'x' prefix;
	 *   'legacy' forces the HTML4-style escaping
	 * @return string
	 */
	static function escapeId( $id, $options = array() ) {
		global $wgHtml5, $wgExperimentalHtmlIds;
		$options = (array)$options;

		if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
			$id = Sanitizer::decodeCharReferences( $id );
			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
			$id = trim( $id, '_' );
			if ( $id === '' ) {
				# Must have been all whitespace to start with.
				return '_';
			} else {
				return $id;
			}
		}

		# HTML4-style escaping
		static $replace = array(
			'%3A' => ':',
			'%' => '.'
		);

		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

		if ( !preg_match( '/^[a-zA-Z]/', $id )
			&& !in_array( 'noninitial', $options ) ) {
			// Initial character must be a letter!
			$id = "x$id";
		}
		return $id;
	}

	/**
	 * Make a string usable as a CSS class name: a leading digit or hyphen,
	 * ASCII control characters, spaces, most ASCII punctuation and
	 * non-breaking spaces become underscores, runs of underscores collapse,
	 * and trailing underscores are removed.
	 *
	 * @param string $class
	 * @return string
	 */
	static function escapeClass( $class ) {
		// Convert ugly stuff to underscores and kill underscores in ugly places
		return rtrim( preg_replace(
			array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ),
			'_',
			$class ), '_' );
	}

	/**
	 * Decode character references in $html, then HTML-escape the result
	 * (including both quote characters).
	 *
	 * @param string $html
	 * @return string
	 */
	static function escapeHtmlAllowEntities( $html ) {
		$html = Sanitizer::decodeCharReferences( $html );
		# It seems wise to escape ' as well as ", as a matter of course.  Can't
		# hurt.
		$html = htmlspecialchars( $html, ENT_QUOTES );
		return $html;
	}

	/**
	 * preg_replace_callback() handler used by safeEncodeAttribute(): writes
	 * the colon of a matched URL protocol as a numeric character reference.
	 *
	 * @param array $matches
	 * @return string
	 */
	private static function armorLinksCallback( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	}

	/**
	 * Parse raw attribute text into an array of attribute name => value.
	 *
	 * Names are lower-cased; values have whitespace runs collapsed to one
	 * space, are trimmed, and have character references decoded. A later
	 * attribute with the same name overrides an earlier one.
	 *
	 * @param string $text
	 * @return array
	 */
	public static function decodeTagAttributes( $text ) {
		if( trim( $text ) == '' ) {
			return array();
		}

		$attribs = array();
		$pairs = array();
		if( !preg_match_all(
			self::getAttribsRegex(),
			$text,
			$pairs,
			PREG_SET_ORDER ) ) {
			return $attribs;
		}

		foreach( $pairs as $set ) {
			$attribute = strtolower( $set[1] );
			$value = Sanitizer::getTagAttributeCallback( $set );

			// Normalize whitespace
			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
			$value = trim( $value );

			// Decode character references
			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
		}
		return $attribs;
	}

	/**
	 * Pick the attribute value out of one match set produced by the regex
	 * from getAttribsRegex().
	 *
	 * @param array $set One PREG_SET_ORDER match
	 * @return string The value; the attribute name itself when the attribute
	 *   was written without a value
	 * @throws MWException If no value group matched although a "= value"
	 *   part did
	 */
	private static function getTagAttributeCallback( $set ) {
		if( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			return $set[6];
		} elseif( isset( $set[5] ) ) {
			# No quotes.
			return $set[5];
		} elseif( isset( $set[4] ) ) {
			# Single-quoted
			return $set[4];
		} elseif( isset( $set[3] ) ) {
			# Double-quoted
			return $set[3];
		} elseif( !isset( $set[2] ) ) {
			# In XHTML, attributes must have a value.
			# For 'reduced' form, return explicitly the attribute name here.
			return $set[1];
		} else {
			throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
		}
	}

	/**
	 * Normalize character references and whitespace in an attribute value
	 * and encode double quotes.
	 *
	 * @param string $text
	 * @return string
	 */
	private static function normalizeAttributeValue( $text ) {
		return str_replace( '"', '&quot;',
			self::normalizeWhitespace(
				Sanitizer::normalizeCharReferences( $text ) ) );
	}

	/**
	 * Replace each CRLF pair, space, CR, LF or tab with a single space.
	 *
	 * @param string $text
	 * @return string
	 */
	private static function normalizeWhitespace( $text ) {
		return preg_replace(
			'/\r\n|[\x20\x0d\x0a\x09]/',
			' ',
			$text );
	}

	/**
	 * Collapse runs of spaces and underscores in a section name to a single
	 * space and trim the result.
	 *
	 * @param string $section
	 * @return string
	 */
	static function normalizeSectionNameWhitespace( $section ) {
		return trim( preg_replace( '/[ _]+/', ' ', $section ) );
	}

	/**
	 * Rewrite every character reference in $text into a normalized form and
	 * escape anything that is not a valid reference.
	 *
	 * @param string $text
	 * @return string
	 */
	static function normalizeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
			$text );
	}

	/**
	 * preg_replace_callback() handler for normalizeCharReferences().
	 *
	 * @param array $matches Match of CHAR_REFS_REGEX
	 * @return string Normalized reference, or the HTML-escaped match when it
	 *   is a bare ampersand or an invalid numeric reference
	 */
	static function normalizeCharReferencesCallback( $matches ) {
		$ret = null;
		if( $matches[1] != '' ) {
			$ret = Sanitizer::normalizeEntity( $matches[1] );
		} elseif( $matches[2] != '' ) {
			$ret = Sanitizer::decCharReference( $matches[2] );
		} elseif( $matches[3] != '' ) {
			$ret = Sanitizer::hexCharReference( $matches[3] );
		}
		if( is_null( $ret ) ) {
			return htmlspecialchars( $matches[0] );
		} else {
			return $ret;
		}
	}

	/**
	 * Normalize a named entity.
	 *
	 * Aliases are mapped to their canonical named entity; lt, gt, amp and
	 * quot are kept as named entities; other known entities become decimal
	 * references; unknown names are returned as escaped text.
	 *
	 * @param string $name Entity name without '&' and ';'
	 * @return string
	 */
	static function normalizeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			return '&' . self::$htmlEntityAliases[$name] . ';';
		} elseif ( in_array( $name,
			array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
			return "&$name;";
		} elseif ( isset( self::$htmlEntities[$name] ) ) {
			return '&#' . self::$htmlEntities[$name] . ';';
		} else {
			// Not a known entity: escape the ampersand so the text is
			// shown literally instead of being treated as a reference.
			return "&amp;$name;";
		}
	}

	/**
	 * Normalize a decimal character reference.
	 *
	 * @param string $codepoint Decimal digits
	 * @return string|null '&#N;', or null if the code point is not allowed
	 */
	static function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		if( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#%d;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Normalize a hexadecimal character reference.
	 *
	 * @param string $codepoint Hexadecimal digits
	 * @return string|null '&#xN;' in lower-case hex, or null if the code
	 *   point is not allowed
	 */
	static function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		if( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#x%x;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Tell whether a code point may appear as a character reference: tab,
	 * LF, CR, or anything in 0x20-0xD7FF, 0xE000-0xFFFD, 0x10000-0x10FFFF.
	 *
	 * @param int $codepoint
	 * @return bool
	 */
	private static function validateCodepoint( $codepoint ) {
		return ($codepoint == 0x09)
			|| ($codepoint == 0x0a)
			|| ($codepoint == 0x0d)
			|| ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
			|| ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
			|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
	}

	/**
	 * Decode every character reference in $text.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text );
	}

	/**
	 * Decode every character reference in $text and, if at least one
	 * replacement was made, pass the result through $wgContLang->normalize().
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferencesAndNormalize( $text ) {
		global $wgContLang;
		$text = preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text, /* limit */ -1, $count );

		if ( $count ) {
			return $wgContLang->normalize( $text );
		} else {
			return $text;
		}
	}

	/**
	 * preg_replace_callback() handler for the decodeCharReferences*()
	 * methods.
	 *
	 * @param array $matches Match of CHAR_REFS_REGEX
	 * @return string
	 */
	static function decodeCharReferencesCallback( $matches ) {
		if( $matches[1] != '' ) {
			return Sanitizer::decodeEntity( $matches[1] );
		} elseif( $matches[2] != '' ) {
			return Sanitizer::decodeChar( intval( $matches[2] ) );
		} elseif( $matches[3] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[3] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	# NOTE(review): only the signature of decodeChar() is visible in this
	# part of the file; its body continues below and is left untouched.
	static function decodeChar( $codepoint ) {
01423 if( Sanitizer::validateCodepoint( $codepoint ) ) { 01424 return codepointToUtf8( $codepoint ); 01425 } else { 01426 return UTF8_REPLACEMENT; 01427 } 01428 } 01429 01438 static function decodeEntity( $name ) { 01439 if ( isset( self::$htmlEntityAliases[$name] ) ) { 01440 $name = self::$htmlEntityAliases[$name]; 01441 } 01442 if( isset( self::$htmlEntities[$name] ) ) { 01443 return codepointToUtf8( self::$htmlEntities[$name] ); 01444 } else { 01445 return "&$name;"; 01446 } 01447 } 01448 01455 static function attributeWhitelist( $element ) { 01456 $list = Sanitizer::setupAttributeWhitelist(); 01457 return isset( $list[$element] ) 01458 ? $list[$element] 01459 : array(); 01460 } 01461 01467 static function setupAttributeWhitelist() { 01468 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes; 01469 01470 static $whitelist, $staticInitialised; 01471 $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) ); 01472 01473 if ( isset( $whitelist ) && $staticInitialised == $globalContext ) { 01474 return $whitelist; 01475 } 01476 01477 $common = array( 01478 # HTML 01479 'id', 01480 'class', 01481 'style', 01482 'lang', 01483 'dir', 01484 'title', 01485 01486 # WAI-ARIA 01487 'role', 01488 ); 01489 01490 if ( $wgAllowRdfaAttributes ) { 01491 #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 01492 $common = array_merge( $common, array( 01493 'about', 'property', 'resource', 'datatype', 'typeof', 01494 ) ); 01495 } 01496 01497 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) { 01498 # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model 01499 $common = array_merge( $common, array( 01500 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' 01501 ) ); 01502 } 01503 01504 $block = array_merge( $common, array( 'align' ) ); 01505 $tablealign = array( 'align', 'char', 'charoff', 
'valign' ); 01506 $tablecell = array( 'abbr', 01507 'axis', 01508 'headers', 01509 'scope', 01510 'rowspan', 01511 'colspan', 01512 'nowrap', # deprecated 01513 'width', # deprecated 01514 'height', # deprecated 01515 'bgcolor' # deprecated 01516 ); 01517 01518 # Numbers refer to sections in HTML 4.01 standard describing the element. 01519 # See: http://www.w3.org/TR/html4/ 01520 $whitelist = array( 01521 # 7.5.4 01522 'div' => $block, 01523 'center' => $common, # deprecated 01524 'span' => $block, # ?? 01525 01526 # 7.5.5 01527 'h1' => $block, 01528 'h2' => $block, 01529 'h3' => $block, 01530 'h4' => $block, 01531 'h5' => $block, 01532 'h6' => $block, 01533 01534 # 7.5.6 01535 # address 01536 01537 # 8.2.4 01538 # bdo 01539 01540 # 9.2.1 01541 'em' => $common, 01542 'strong' => $common, 01543 'cite' => $common, 01544 'dfn' => $common, 01545 'code' => $common, 01546 'samp' => $common, 01547 'kbd' => $common, 01548 'var' => $common, 01549 'abbr' => $common, 01550 # acronym 01551 01552 # 9.2.2 01553 'blockquote' => array_merge( $common, array( 'cite' ) ), 01554 # q 01555 01556 # 9.2.3 01557 'sub' => $common, 01558 'sup' => $common, 01559 01560 # 9.3.1 01561 'p' => $block, 01562 01563 # 9.3.2 01564 'br' => array( 'id', 'class', 'title', 'style', 'clear' ), 01565 01566 # 9.3.4 01567 'pre' => array_merge( $common, array( 'width' ) ), 01568 01569 # 9.4 01570 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ), 01571 'del' => array_merge( $common, array( 'cite', 'datetime' ) ), 01572 01573 # 10.2 01574 'ul' => array_merge( $common, array( 'type' ) ), 01575 'ol' => array_merge( $common, array( 'type', 'start' ) ), 01576 'li' => array_merge( $common, array( 'type', 'value' ) ), 01577 01578 # 10.3 01579 'dl' => $common, 01580 'dd' => $common, 01581 'dt' => $common, 01582 01583 # 11.2.1 01584 'table' => array_merge( $common, 01585 array( 'summary', 'width', 'border', 'frame', 01586 'rules', 'cellspacing', 'cellpadding', 01587 'align', 'bgcolor', 01588 ) ), 01589 
01590 # 11.2.2 01591 'caption' => array_merge( $common, array( 'align' ) ), 01592 01593 # 11.2.3 01594 'thead' => array_merge( $common, $tablealign ), 01595 'tfoot' => array_merge( $common, $tablealign ), 01596 'tbody' => array_merge( $common, $tablealign ), 01597 01598 # 11.2.4 01599 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ), 01600 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ), 01601 01602 # 11.2.5 01603 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ), 01604 01605 # 11.2.6 01606 'td' => array_merge( $common, $tablecell, $tablealign ), 01607 'th' => array_merge( $common, $tablecell, $tablealign ), 01608 01609 # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object 01610 'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa 01611 01612 # 13.2 01613 # Not usually allowed, but may be used for extension-style hooks 01614 # such as <math> when it is rasterized, or if $wgAllowImageTag is 01615 # true 01616 'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ), 01617 01618 # 15.2.1 01619 'tt' => $common, 01620 'b' => $common, 01621 'i' => $common, 01622 'big' => $common, 01623 'small' => $common, 01624 'strike' => $common, 01625 's' => $common, 01626 'u' => $common, 01627 01628 # 15.2.2 01629 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ), 01630 # basefont 01631 01632 # 15.3 01633 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ), 01634 01635 # XHTML Ruby annotation text module, simple ruby only. 
01636 # http://www.w3c.org/TR/ruby/ 01637 'ruby' => $common, 01638 # rbc 01639 # rtc 01640 'rb' => $common, 01641 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), 01642 'rp' => $common, 01643 01644 # MathML root element, where used for extensions 01645 # 'title' may not be 100% valid here; it's XHTML 01646 # http://www.w3.org/TR/REC-MathML/ 01647 'math' => array( 'class', 'style', 'id', 'title' ), 01648 01649 # HTML 5 section 4.6 01650 'bdi' => $common, 01651 01652 ); 01653 01654 if ( $wgHtml5 ) { 01655 # HTML5 elements, defined by: 01656 # http://www.whatwg.org/specs/web-apps/current-work/multipage/ 01657 $whitelist += array( 01658 'data' => array_merge( $common, array( 'value' ) ), 01659 'time' => array_merge( $common, array( 'datetime' ) ), 01660 'mark' => $common, 01661 01662 // meta and link are only permitted by removeHTMLtags when Microdata 01663 // is enabled so we don't bother adding a conditional to hide these 01664 // Also meta and link are only valid in WikiText as Microdata elements 01665 // (ie: validateTag rejects tags missing the attributes needed for Microdata) 01666 // So we don't bother including $common attributes that have no purpose. 
01667 'meta' => array( 'itemprop', 'content' ), 01668 'link' => array( 'itemprop', 'href' ), 01669 ); 01670 } 01671 01672 $staticInitialised = $globalContext; 01673 01674 return $whitelist; 01675 } 01676 01687 static function stripAllTags( $text ) { 01688 # Actual <tags> 01689 $text = StringUtils::delimiterReplace( '<', '>', '', $text ); 01690 01691 # Normalize &entities and whitespace 01692 $text = self::decodeCharReferences( $text ); 01693 $text = self::normalizeWhitespace( $text ); 01694 01695 return $text; 01696 } 01697 01707 static function hackDocType() { 01708 $out = "<!DOCTYPE html [\n"; 01709 foreach( self::$htmlEntities as $entity => $codepoint ) { 01710 $out .= "<!ENTITY $entity \"&#$codepoint;\">"; 01711 } 01712 $out .= "]>\n"; 01713 return $out; 01714 } 01715 01720 static function cleanUrl( $url ) { 01721 # Normalize any HTML entities in input. They will be 01722 # re-escaped by makeExternalLink(). 01723 $url = Sanitizer::decodeCharReferences( $url ); 01724 01725 # Escape any control characters introduced by the above step 01726 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 01727 array( __CLASS__, 'cleanUrlCallback' ), $url ); 01728 01729 # Validate hostname portion 01730 $matches = array(); 01731 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { 01732 list( /* $whole */, $protocol, $host, $rest ) = $matches; 01733 01734 // Characters that will be ignored in IDNs. 01735 // http://tools.ietf.org/html/3454#section-3.1 01736 // Strip them before further processing so blacklists and such work. 
01737 $strip = "/ 01738 \\s| # general whitespace 01739 \xc2\xad| # 00ad SOFT HYPHEN 01740 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 01741 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 01742 \xe2\x81\xa0| # 2060 WORD JOINER 01743 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 01744 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 01745 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 01746 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 01747 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 01748 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 01749 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 01750 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 01751 /xuD"; 01752 01753 $host = preg_replace( $strip, '', $host ); 01754 01755 // @todo FIXME: Validate hostnames here 01756 01757 return $protocol . $host . $rest; 01758 } else { 01759 return $url; 01760 } 01761 } 01762 01767 static function cleanUrlCallback( $matches ) { 01768 return urlencode( $matches[0] ); 01769 } 01770 01799 public static function validateEmail( $addr ) { 01800 $result = null; 01801 if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) { 01802 return $result; 01803 } 01804 01805 // Please note strings below are enclosed in brackets [], this make the 01806 // hyphen "-" a range indicator. Hence it is double backslashed below. 01807 // See bug 26948 01808 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~"; 01809 $rfc1034_ldh_str = "a-z0-9\\-"; 01810 01811 $HTML5_email_regexp = "/ 01812 ^ # start of string 01813 [$rfc5322_atext\\.]+ # user part which is liberal :p 01814 @ # 'apostrophe' 01815 [$rfc1034_ldh_str]+ # First domain part 01816 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot 01817 $ # End of string 01818 /ix"; // case Insensitive, eXtended 01819 01820 return (bool) preg_match( $HTML5_email_regexp, $addr ); 01821 } 01822 }