MediaWiki
REL1_23
|
<?php

/**
 * HTML sanitizer: static helpers for filtering HTML tags and attributes,
 * inline CSS, element ids/classes and character references.
 */
class Sanitizer {
	/**
	 * Regular expression to match various types of character references.
	 * Groups: 1 = named entity, 2 = decimal number, 3 = hex number,
	 * 4 = a bare ampersand.
	 */
	const CHAR_REFS_REGEX =
		'/&([A-Za-z0-9\x80-\xff]+);
		 |&\#([0-9]+);
		 |&\#[xX]([0-9A-Fa-f]+);
		 |(&)/x';

	/**
	 * Pattern matching "javascript" / "vbscript" at the start of a value,
	 * after whitespace, or after a closing comment marker. Used to reject
	 * suspicious URI-like attribute values.
	 */
	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

	/**
	 * Map of named character entities to their numeric code points.
	 * Used by normalizeEntity() to turn a known name into a numeric reference.
	 */
	private static $htmlEntities = array(
		'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
		'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
		'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
		'amp' => 38, 'and' => 8743, 'ang' => 8736,
		'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
		'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195,
		'atilde' => 227, 'Auml' => 196, 'auml' => 228, 'bdquo' => 8222,
		'Beta' => 914, 'beta' => 946, 'brvbar' => 166, 'bull' => 8226,
		'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
		'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
		'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
		'cup' => 8746, 'curren' => 164, 'dagger' => 8224, 'Dagger' => 8225,
		'darr' => 8595, 'dArr' => 8659, 'deg' => 176, 'Delta' => 916,
		'delta' => 948, 'diams' => 9830, 'divide' => 247, 'Eacute' => 201,
		'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200,
		'egrave' => 232, 'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194,
		'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919,
		'eta' => 951, 'ETH' => 208, 'eth' => 240, 'Euml' => 203,
		'euml' => 235, 'euro' => 8364, 'exist' => 8707, 'fnof' => 402,
		'forall' => 8704, 'frac12' => 189, 'frac14' => 188, 'frac34' => 190,
		'frasl' => 8260, 'Gamma' => 915, 'gamma' => 947, 'ge' => 8805,
		'gt' => 62, 'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829,
		'hellip' => 8230, 'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206,
		'icirc' => 238, 'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236,
		'image' => 8465, 'infin' => 8734, 'int' => 8747, 'Iota' => 921,
		'iota' => 953, 'iquest' => 191, 'isin' => 8712, 'Iuml' => 207,
		'iuml' => 239, 'Kappa' => 922, 'kappa' => 954, 'Lambda' => 923,
		'lambda' => 955, 'lang' => 9001, 'laquo' => 171, 'larr' => 8592,
		'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804,
		'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206,
		'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60, 'macr' => 175,
		'mdash' => 8212, 'micro' => 181, 'middot' => 183, 'minus' => 8722,
		'Mu' => 924, 'mu' => 956, 'nabla' => 8711, 'nbsp' => 160,
		'ndash' => 8211, 'ne' => 8800, 'ni' => 8715, 'not' => 172,
		'notin' => 8713, 'nsub' => 8836, 'Ntilde' => 209, 'ntilde' => 241,
		'Nu' => 925, 'nu' => 957, 'Oacute' => 211, 'oacute' => 243,
		'Ocirc' => 212, 'ocirc' => 244, 'OElig' => 338, 'oelig' => 339,
		'Ograve' => 210, 'ograve' => 242, 'oline' => 8254, 'Omega' => 937,
		'omega' => 969, 'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853,
		'or' => 8744, 'ordf' => 170, 'ordm' => 186, 'Oslash' => 216,
		'oslash' => 248, 'Otilde' => 213, 'otilde' => 245, 'otimes' => 8855,
		'Ouml' => 214, 'ouml' => 246, 'para' => 182, 'part' => 8706,
		'permil' => 8240, 'perp' => 8869, 'Phi' => 934, 'phi' => 966,
		'Pi' => 928, 'pi' => 960, 'piv' => 982, 'plusmn' => 177,
		'pound' => 163, 'prime' => 8242, 'Prime' => 8243, 'prod' => 8719,
		'prop' => 8733, 'Psi' => 936, 'psi' => 968, 'quot' => 34,
		'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
		'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
		'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
		'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217, 'sbquo' => 8218,
		'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901, 'sect' => 167,
		'shy' => 173, 'Sigma' => 931, 'sigma' => 963, 'sigmaf' => 962,
		'sim' => 8764, 'spades' => 9824, 'sub' => 8834, 'sube' => 8838,
		'sum' => 8721, 'sup' => 8835, 'sup1' => 185, 'sup2' => 178,
		'sup3' => 179, 'supe' => 8839, 'szlig' => 223, 'Tau' => 932,
		'tau' => 964, 'there4' => 8756, 'Theta' => 920, 'theta' => 952,
		'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222, 'thorn' => 254,
		'tilde' => 732, 'times' => 215, 'trade' => 8482, 'Uacute' => 218,
		'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219,
		'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168,
		'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220,
		'uuml' => 252, 'weierp' => 8472, 'Xi' => 926, 'xi' => 958,
		'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
		'yuml' => 255, 'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205,
		'zwnj' => 8204
	);

	/**
	 * Character entity aliases: alternative names that normalizeEntity()
	 * rewrites to the canonical entity name.
	 */
	private static $htmlEntityAliases = array(
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	);

	/**
	 * Lazy-initialised attribute matching regex; see getAttribsRegex().
	 */
	private static $attribsRegex;

	/**
	 * Build (once) and return the regular expression used to pick
	 * name[=value] pairs out of a tag's attribute string.
	 *
	 * Capture groups: 1 = name, 2 = the whole "=value" part, 3 = double-quoted
	 * value, 4 = single-quoted value, 5 = unquoted value, 6 = unquoted
	 * "#hex" value.
	 *
	 * @return string PCRE pattern
	 */
	static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$attribFirst = '[:A-Z_a-z0-9]';
			$attrib = '[:A-Z_a-z-.0-9]';
			$space = '[\x09\x0a\x0d\x20]';
			self::$attribsRegex =
				"/(?:^|$space)({$attribFirst}{$attrib}*)
				  ($space*=$space*
					(?:
					 # The attribute value: quoted or alone
					  \"([^<\"]*)\"
					 | '([^<']*)'
					 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
					 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
										 # colors are specified like this.
										 # We'll be normalizing it.
					)
				)?(?=$space|\$)/sx";
		}
		return self::$attribsRegex;
	}

	/**
	 * Filter the HTML tags in a piece of text: tags on the allowed list are
	 * kept (with their attributes run through fixTagAttributes()), everything
	 * else has its angle brackets escaped. HTML comments are removed first.
	 * When $wgUseTidy is off, tag nesting is also tracked and unclosed tags
	 * are closed at the end.
	 *
	 * @param string $text Text to clean
	 * @param callable|null $processCallback Called as ( &$params, $args ) on
	 *   each allowed tag's attribute string before validation
	 * @param array|bool $args Second argument passed to $processCallback
	 * @param array $extratags Additional tag names to allow (treated as paired)
	 * @param array $removetags Tag names to remove from the allowed set
	 * @return string
	 */
	static function removeHTMLtags( $text, $processCallback = null,
		$args = array(), $extratags = array(), $removetags = array()
	) {
		global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;

		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

		wfProfileIn( __METHOD__ );

		// Base our staticInitialised variable off of the global config state so that if the globals
		// are changed (like in the screwed up test system) we will re-initialise the settings.
		$globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
		if ( !$staticInitialised || $staticInitialised != $globalContext ) {

			$htmlpairsStatic = array( # Tags that must be closed
				'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
				'kbd', 'samp', 'data', 'time', 'mark'
			);
			$htmlsingle = array(
				'br', 'wbr', 'hr', 'li', 'dt', 'dd'
			);
			$htmlsingleonly = array( # Elements that cannot have close tags
				'br', 'wbr', 'hr'
			);
			if ( $wgAllowMicrodataAttributes ) {
				$htmlsingle[] = $htmlsingleonly[] = 'meta';
				$htmlsingle[] = $htmlsingleonly[] = 'link';
			}
			$htmlnest = array( # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
				'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
			);
			$tabletags = array( # Can only appear inside table, we will close them
				'td', 'th', 'tr',
			);
			$htmllist = array( # Tags used by list
				'ul', 'ol',
			);
			$listtags = array( # Tags that can appear in a list
				'li',
			);

			if ( $wgAllowImageTag ) {
				$htmlsingle[] = 'img';
				$htmlsingleonly[] = 'img';
			}

			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

			# Convert them all to hashtables for faster lookup
			$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
				'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
			foreach ( $vars as $var ) {
				$$var = array_flip( $$var );
			}
			$staticInitialised = $globalContext;
		}
		# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
		$extratags = array_flip( $extratags );
		$removetags = array_flip( $removetags );
		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

		# Remove HTML comments
		$text = Sanitizer::removeHTMLcomments( $text );
		$bits = explode( '<', $text );
		# Anything before the first '<' cannot be a tag; just escape stray '>'
		$text = str_replace( '>', '&gt;', array_shift( $bits ) );
		if ( !$wgUseTidy ) {
			$tagstack = $tablestack = array();
			foreach ( $bits as $x ) {
				$regs = array();
				# $slash: Does the current element start with a '/'?
				# $t: Current element name
				# $params: String between element name and >
				# $brace: Ending '>' or '/>'
				# $rest: Everything until the next element of $bits
				if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				} else {
					$slash = $t = $params = $brace = $rest = null;
				}

				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					# Check our stack
					if ( $slash && isset( $htmlsingleonly[$t] ) ) {
						$badtag = true;
					} elseif ( $slash ) {
						# Closing a tag... is it the one we just opened?
						$ot = @array_pop( $tagstack );
						if ( $ot != $t ) {
							if ( isset( $htmlsingleallowed[$ot] ) ) {
								# Pop all elements with an optional close tag
								# and see if we find a match below them
								$optstack = array();
								array_push( $optstack, $ot );
								wfSuppressWarnings();
								$ot = array_pop( $tagstack );
								wfRestoreWarnings();
								while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
									array_push( $optstack, $ot );
									wfSuppressWarnings();
									$ot = array_pop( $tagstack );
									wfRestoreWarnings();
								}
								if ( $t != $ot ) {
									# No match. Push the optional elements back again
									$badtag = true;
									wfSuppressWarnings();
									$ot = array_pop( $optstack );
									wfRestoreWarnings();
									while ( $ot ) {
										array_push( $tagstack, $ot );
										wfSuppressWarnings();
										$ot = array_pop( $optstack );
										wfRestoreWarnings();
									}
								}
							} else {
								@array_push( $tagstack, $ot );
								# <li> can be nested in <ul> or <ol>, skip those cases:
								if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
									$badtag = true;
								}
							}
						} else {
							if ( $t == 'table' ) {
								$tagstack = array_pop( $tablestack );
							}
						}
						$newparams = '';
					} else {
						# Keep track for later
						if ( isset( $tabletags[$t] ) &&
							!in_array( 'table', $tagstack ) ) {
							$badtag = true;
						} elseif ( in_array( $t, $tagstack ) &&
							!isset( $htmlnest[$t] ) ) {
							$badtag = true;
						# Is it a self closed htmlpair ? (bug 5487)
						} elseif ( $brace == '/>' &&
							isset( $htmlpairs[$t] ) ) {
							$badtag = true;
						} elseif ( isset( $htmlsingleonly[$t] ) ) {
							# Hack to force empty tag for unclosable elements
							$brace = '/>';
						} elseif ( isset( $htmlsingle[$t] ) ) {
							# Hack to not close $htmlsingle tags
							$brace = null;
							# Still need to push this optionally-closed tag to
							# the tag stack so that we can match end tags
							# instead of marking them as bad.
							array_push( $tagstack, $t );
						} elseif ( isset( $tabletags[$t] )
							&& in_array( $t, $tagstack ) ) {
							// New table tag but forgot to close the previous one
							$text .= "</$t>";
						} else {
							if ( $t == 'table' ) {
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}

						# Replace any variables or template parameters with
						# plaintext results.
						if ( is_callable( $processCallback ) ) {
							call_user_func_array( $processCallback, array( &$params, $args ) );
						}

						if ( !Sanitizer::validateTag( $params, $t ) ) {
							$badtag = true;
						}

						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
						$text .= "<$slash$t$newparams$close>$rest";
						continue;
					}
				}
				# Not an allowed tag (or a bad one): emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
			# Close off any remaining tags
			while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
				$text .= "</$t>\n";
				if ( $t == 'table' ) {
					$tagstack = array_pop( $tablestack );
				}
			}
		} else {
			# this might be possible using tidy itself
			foreach ( $bits as $x ) {
				preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
				$x, $regs );
				@list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
				# Not an allowed tag (or a bad one): emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Remove '<!--', '-->', and everything between them.
	 * If a comment sits alone on its line (only spaces around it, newline
	 * before and after), the surrounding spaces and one newline go too.
	 * An unterminated comment stops processing and is left in place.
	 *
	 * @param string $text
	 * @return string
	 */
	static function removeHTMLcomments( $text ) {
		wfProfileIn( __METHOD__ );
		while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
			$end = strpos( $text, '-->', $start + 4 );
			if ( $end === false ) {
				# Unterminated comment; bail out
				break;
			}

			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline
			$spaceStart = max( $start - 1, 0 );
			$spaceLen = $end - $spaceStart;
			while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
				$spaceStart--;
				$spaceLen++;
			}
			while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
				$spaceLen++;
			}
			if ( substr( $text, $spaceStart, 1 ) === "\n"
				&& substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
			} else {
				# Remove just the comment.
				$text = substr_replace( $text, '', $start, $end - $start );
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Check whether a tag with the given attribute string is acceptable.
	 * Only <meta> and <link> have extra requirements: both need an itemprop
	 * attribute, <meta> also needs content, <link> also needs href.
	 *
	 * @param string $params Raw attribute string of the tag
	 * @param string $element Lower-case element name
	 * @return bool
	 */
	static function validateTag( $params, $element ) {
		$params = Sanitizer::decodeTagAttributes( $params );

		if ( $element == 'meta' || $element == 'link' ) {
			if ( !isset( $params['itemprop'] ) ) {
				// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
				return false;
			}
			if ( $element == 'meta' && !isset( $params['content'] ) ) {
				// <meta> must have a content="" for the itemprop
				return false;
			}
			if ( $element == 'link' && !isset( $params['href'] ) ) {
				// <link> must have an associated href=""
				return false;
			}
		}

		return true;
	}

	/**
	 * Filter decoded attributes against the whitelist for an element.
	 *
	 * @param array $attribs Map of attribute name => decoded value
	 * @param string $element Element name
	 * @return array Filtered map; see validateAttributes()
	 */
	static function validateTagAttributes( $attribs, $element ) {
		return Sanitizer::validateAttributes( $attribs,
			Sanitizer::attributeWhitelist( $element ) );
	}

	/**
	 * Filter decoded attributes against an explicit whitelist.
	 *
	 * - Attributes that are neither whitelisted nor "data-*" are dropped.
	 * - 'style' values go through checkCss(); 'id' values through escapeId().
	 * - 'role' is kept only when it is exactly "presentation".
	 * - RDFa/microdata attributes matching EVIL_URI_PATTERN are dropped.
	 * - 'href'/'src' are kept only when they start with an allowed protocol.
	 *
	 * @param array $attribs Map of attribute name => decoded value
	 * @param array $whitelist List of allowed attribute names
	 * @return array Filtered map of attribute name => value
	 */
	static function validateAttributes( $attribs, $whitelist ) {
		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

		$whitelist = array_flip( $whitelist );
		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

		$out = array();
		foreach ( $attribs as $attribute => $value ) {
			#allow XML namespace declaration if RDFa is enabled
			if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					$out[$attribute] = $value;
				}

				continue;
			}

			# Allow any attribute beginning with "data-"
			if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
				continue;
			}

			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			if ( $attribute == 'style' ) {
				$value = Sanitizer::checkCss( $value );
			}

			if ( $attribute === 'id' ) {
				$value = Sanitizer::escapeId( $value, 'noninitial' );
			}

			# WAI-ARIA
			# http://www.w3.org/TR/wai-aria/
			# http://www.whatwg.org/html/elements.html#wai-aria
			# For now we only support role="presentation" until we work out what roles should be
			# usable by content and we ensure that our code explicitly rejects patterns that
			# violate HTML5's ARIA restrictions.
			if ( $attribute === 'role' && $value !== 'presentation' ) {
				continue;
			}

			// RDFa and microdata properties allow URLs, URIs and/or CURIs.
			// Check them for sanity.
			if ( $attribute === 'rel' || $attribute === 'rev'
				# RDFa
				|| $attribute === 'about' || $attribute === 'property'
				|| $attribute === 'resource' || $attribute === 'datatype'
				|| $attribute === 'typeof'
				# HTML5 microdata
				|| $attribute === 'itemid' || $attribute === 'itemprop'
				|| $attribute === 'itemref' || $attribute === 'itemscope'
				|| $attribute === 'itemtype'
			) {
				//Paranoia. Allow "simple" values but suppress javascript
				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					continue;
				}
			}

			# NOTE: even though elements using href/src are not allowed directly, supply
			#       validation code that can be used by tag hook handlers, etc
			if ( $attribute === 'href' || $attribute === 'src' ) {
				if ( !preg_match( $hrefExp, $value ) ) {
					continue; //drop any href or src attributes not using an allowed protocol.
					// NOTE: this also drops all relative URLs
				}
			}

			// If this attribute was previously set, override it.
			// Output should only have one attribute of each name.
			$out[$attribute] = $value;
		}

		if ( $wgAllowMicrodataAttributes ) {
			# itemtype, itemid, itemref don't make sense without itemscope
			if ( !array_key_exists( 'itemscope', $out ) ) {
				unset( $out['itemtype'] );
				unset( $out['itemid'] );
				unset( $out['itemref'] );
			}
			# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
		}
		return $out;
	}

	/**
	 * Merge two attribute maps. Values in $b override those in $a, except
	 * 'class': when both are strings and differ, the class lists are
	 * combined, split on whitespace and de-duplicated.
	 *
	 * @param array $a
	 * @param array $b
	 * @return array
	 */
	static function mergeAttributes( $a, $b ) {
		$out = array_merge( $a, $b );
		if ( isset( $a['class'] ) && isset( $b['class'] )
			&& is_string( $a['class'] ) && is_string( $b['class'] )
			&& $a['class'] !== $b['class']
		) {
			$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
				-1, PREG_SPLIT_NO_EMPTY );
			$out['class'] = implode( ' ', array_unique( $classes ) );
		}
		return $out;
	}

	/**
	 * Normalize a CSS value so that checkCss() sees what a browser would:
	 * decode character references and CSS backslash escapes, fold
	 * look-alike characters to ASCII, and strip comments.
	 *
	 * @param string $value
	 * @return string Normalized CSS
	 */
	public static function normalizeCss( $value ) {

		// Decode character references like &#123;
		$value = Sanitizer::decodeCharReferences( $value );

		// Decode escape sequences and line continuation
		// See the grammar in the CSS 2 spec, appendix D.
		// This has to be done AFTER decoding character references.
		// This means it isn't possible for this function to return
		// unsanitized escape sequences. It is possible to manufacture
		// input that contains character references that decode to
		// escape sequences that decode to character references, but
		// it's OK for the return value to contain character references
		// because the caller is supposed to escape those anyway.
		static $decodeRegex;
		if ( !$decodeRegex ) {
			$space = '[\\x20\\t\\r\\n\\f]';
			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
			$backslash = '\\\\';
			$decodeRegex = "/ $backslash
				(?:
					($nl) |  # 1. Line continuation
					([0-9A-Fa-f]{1,6})$space? |  # 2. character number
					(.) | # 3. backslash cancelling special meaning
					() | # 4. backslash at end of string
				)/xu";
		}
		$value = preg_replace_callback( $decodeRegex,
			array( __CLASS__, 'cssDecodeCallback' ), $value );

		// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
		$value = preg_replace_callback(
			// U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
			'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u',
			function ( $matches ) {
				$cp = utf8ToCodepoint( $matches[0] );
				if ( $cp === false ) {
					return '';
				}
				return chr( $cp - 65248 ); // ASCII range \x21-\x7A
			},
			$value
		);

		// Convert more characters IE6 might treat as ascii
		// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
		$value = str_replace(
			array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
			array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
			$value
		);

		// Let the value through if it's nothing but a single comment, to
		// allow other functions which may reject it to pass some error
		// message through.
		if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
			// Remove any comments; IE gets token splitting wrong
			// This must be done AFTER decoding character references and
			// escape sequences, because those steps can introduce comments
			// This step cannot introduce character references or escape
			// sequences, because it replaces comments with spaces rather
			// than removing them completely.
			$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

			// Remove anything after a comment-start token, to guard against
			// incorrect client implementations.
			$commentPos = strpos( $value, '/*' );
			if ( $commentPos !== false ) {
				$value = substr( $value, 0, $commentPos );
			}
		}

		// S followed by repeat, iteration, or prolonged sound marks,
		// which IE will treat as "ss"
		$value = preg_replace(
			'/s(?:
				\xE3\x80\xB1 | # U+3031
				\xE3\x82\x9D | # U+309D
				\xE3\x83\xBC | # U+30FC
				\xE3\x83\xBD | # U+30FD
				\xEF\xB9\xBC | # U+FE7C
				\xEF\xB9\xBD | # U+FE7D
				\xEF\xBD\xB0   # U+FF70
			)/ix',
			'ss',
			$value
		);

		return $value;
	}


	/**
	 * Normalize a CSS value and reject it if it looks dangerous.
	 * Returns a CSS comment in place of the value when it contains control
	 * characters or one of the blocked constructs (expression, filter:,
	 * accelerator:, -o-link*, -o-replace:, url(, image(, image-set().
	 *
	 * @param string $value
	 * @return string Normalized CSS, or a comment naming the rejection reason
	 */
	static function checkCss( $value ) {
		$value = self::normalizeCss( $value );

		// Reject problematic keywords and control characters
		if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
			return '/* invalid control char */';
		} elseif ( preg_match(
			'! expression
				| filter\s*:
				| accelerator\s*:
				| -o-link\s*:
				| -o-link-source\s*:
				| -o-replace\s*:
				| url\s*\(
				| image\s*\(
				| image-set\s*\(
			!ix', $value ) ) {
			return '/* insecure input */';
		}
		return $value;
	}

	/**
	 * preg_replace_callback() handler for the escape regex in normalizeCss().
	 * Line continuations vanish; newline, quotes and backslash are re-emitted
	 * as hex escapes; any other escaped character is returned unescaped.
	 *
	 * @param array $matches
	 * @return string
	 */
	static function cssDecodeCallback( $matches ) {
		if ( $matches[1] !== '' ) {
			// Line continuation
			return '';
		} elseif ( $matches[2] !== '' ) {
			$char = codepointToUtf8( hexdec( $matches[2] ) );
		} elseif ( $matches[3] !== '' ) {
			$char = $matches[3];
		} else {
			$char = '\\';
		}
		if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $char ) ) . ' ';
		} else {
			// Decode unnecessary escape
			return $char;
		}
	}

	/**
	 * Decode a tag's attribute string, drop attributes not allowed on the
	 * element, and re-encode the rest.
	 *
	 * @param string $text Raw attribute string
	 * @param string $element Element name
	 * @return string Encoded attributes with a leading space, or ''
	 */
	static function fixTagAttributes( $text, $element ) {
		if ( trim( $text ) == '' ) {
			return '';
		}

		$decoded = Sanitizer::decodeTagAttributes( $text );
		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );

		return Sanitizer::safeEncodeTagAttributes( $stripped );
	}

	/**
	 * Encode an attribute value for HTML output.
	 *
	 * @param string $text
	 * @return string HTML-encoded text fragment
	 */
	static function encodeAttribute( $text ) {
		$encValue = htmlspecialchars( $text, ENT_QUOTES );

		// Whitespace is normalized during attribute decoding,
		// so if we've been passed non-spaces we must encode them
		// ahead of time or they won't be preserved.
		$encValue = strtr( $encValue, array(
			"\n" => '&#10;',
			"\r" => '&#13;',
			"\t" => '&#9;',
		) );

		return $encValue;
	}

	/**
	 * Encode an attribute value for HTML output, additionally armouring
	 * characters and keywords that later wikitext parsing could expand.
	 *
	 * @param string $text
	 * @return string HTML-encoded text fragment
	 */
	static function safeEncodeAttribute( $text ) {
		$encValue = Sanitizer::encodeAttribute( $text );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$encValue = strtr( $encValue, array(
			'<' => '&lt;', // This should never happen,
			'>' => '&gt;', // we've received invalid input
			'"' => '&quot;', // which should have been escaped.
			'{' => '&#123;',
			'[' => '&#91;',
			"''" => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC' => '&#82;FC',
			'PMID' => '&#80;MID',
			'|' => '&#124;',
			'__' => '&#95;_',
		) );

		# Stupid hack
		$encValue = preg_replace_callback(
			'/((?i)' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$encValue );
		return $encValue;
	}

	/**
	 * Turn an arbitrary string into a value usable as an HTML id.
	 *
	 * With $wgExperimentalHtmlIds on (and no 'legacy' option), runs of
	 * whitespace and of _ ' " & # % collapse to a single underscore.
	 * Otherwise the id is URL-encoded with '%' replaced by '.', and gets an
	 * 'x' prefix if it does not start with a letter (unless 'noninitial'
	 * is given).
	 *
	 * @param string $id
	 * @param string|array $options 'noninitial' and/or 'legacy'
	 * @return string
	 */
	static function escapeId( $id, $options = array() ) {
		global $wgExperimentalHtmlIds;
		$options = (array)$options;

		if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
			$id = Sanitizer::decodeCharReferences( $id );
			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
			$id = trim( $id, '_' );
			if ( $id === '' ) {
				# Must have been all whitespace to start with.
				return '_';
			} else {
				return $id;
			}
		}

		# HTML4-style escaping
		static $replace = array(
			'%3A' => ':',
			'%' => '.'
		);

		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

		if ( !preg_match( '/^[a-zA-Z]/', $id )
			&& !in_array( 'noninitial', $options ) ) {
			// Initial character must be a letter!
			$id = "x$id";
		}
		return $id;
	}

	/**
	 * Turn an arbitrary string into a value usable as a CSS class name:
	 * a leading digit or hyphen, ASCII punctuation, control characters and
	 * non-breaking spaces become underscores; runs of underscores are
	 * collapsed and trailing ones removed.
	 *
	 * @param string $class
	 * @return string
	 */
	static function escapeClass( $class ) {
		// Convert ugly stuff to underscores and kill underscores in ugly places
		return rtrim( preg_replace(
			array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ),
			'_',
			$class ), '_' );
	}

	/**
	 * Escape a string for HTML output after first decoding any character
	 * references it contains, so existing entities are not double-escaped.
	 *
	 * @param string $html
	 * @return string Escaped text
	 */
	static function escapeHtmlAllowEntities( $html ) {
		$html = Sanitizer::decodeCharReferences( $html );
		# It seems wise to escape ' as well as ", as a matter of course.  Can't
		# hurt.
		$html = htmlspecialchars( $html, ENT_QUOTES );
		return $html;
	}

	/**
	 * preg_replace_callback() handler for safeEncodeAttribute(): replaces
	 * the colon in a matched URL protocol with a numeric reference.
	 *
	 * @param array $matches
	 * @return string
	 */
	private static function armorLinksCallback( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	}

	/**
	 * Parse a tag's attribute string into a map of lower-cased attribute
	 * name => value, with whitespace collapsed and character references
	 * decoded. A later duplicate of a name overrides an earlier one.
	 *
	 * @param string $text
	 * @return array
	 */
	public static function decodeTagAttributes( $text ) {
		if ( trim( $text ) == '' ) {
			return array();
		}

		$attribs = array();
		$pairs = array();
		if ( !preg_match_all(
			self::getAttribsRegex(),
			$text,
			$pairs,
			PREG_SET_ORDER ) ) {
			return $attribs;
		}

		foreach ( $pairs as $set ) {
			$attribute = strtolower( $set[1] );
			$value = Sanitizer::getTagAttributeCallback( $set );

			// Normalize whitespace
			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
			$value = trim( $value );

			// Decode character references
			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
		}
		return $attribs;
	}

	/**
	 * Build an attribute string from a map of name => value, encoding each
	 * value with safeEncodeAttribute().
	 *
	 * @param array $assoc_array
	 * @return string Attributes with a leading space, or '' if there are none
	 */
	public static function safeEncodeTagAttributes( $assoc_array ) {
		$attribs = array();
		foreach ( $assoc_array as $attribute => $value ) {
			$encAttribute = htmlspecialchars( $attribute );
			$encValue = Sanitizer::safeEncodeAttribute( $value );

			$attribs[] = "$encAttribute=\"$encValue\"";
		}
		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
	}

	/**
	 * Pick the attribute value out of one match set produced by
	 * getAttribsRegex(). A valueless attribute yields its own name.
	 *
	 * @param array $set
	 * @throws MWException When no branch matches
	 * @return string
	 */
	private static function getTagAttributeCallback( $set ) {
		if ( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			return $set[6];
		} elseif ( isset( $set[5] ) ) {
			# No quotes.
			return $set[5];
		} elseif ( isset( $set[4] ) ) {
			# Single-quoted
			return $set[4];
		} elseif ( isset( $set[3] ) ) {
			# Double-quoted
			return $set[3];
		} elseif ( !isset( $set[2] ) ) {
			# In XHTML, attributes must have a value.
			# For 'reduced' form, return explicitly the attribute name here.
			return $set[1];
		} else {
			throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
		}
	}

	/**
	 * Normalize an attribute value: normalize character references, turn
	 * each whitespace character into a space, and encode double quotes.
	 *
	 * @param string $text
	 * @return string
	 */
	private static function normalizeAttributeValue( $text ) {
		return str_replace( '"', '&quot;',
			self::normalizeWhitespace(
				Sanitizer::normalizeCharReferences( $text ) ) );
	}

	/**
	 * Replace each CRLF pair, space, CR, LF or tab with a single space.
	 *
	 * @param string $text
	 * @return string
	 */
	private static function normalizeWhitespace( $text ) {
		return preg_replace(
			'/\r\n|[\x20\x0d\x0a\x09]/',
			' ',
			$text );
	}

	/**
	 * Collapse runs of spaces and underscores in a section name to a single
	 * space and trim the result.
	 *
	 * @param string $section
	 * @return string
	 */
	static function normalizeSectionNameWhitespace( $section ) {
		return trim( preg_replace( '/[ _]+/', ' ', $section ) );
	}

	/**
	 * Rewrite every character reference in the text into a normalized form;
	 * references that cannot be validated, and bare ampersands, are escaped.
	 *
	 * @param string $text
	 * @return string
	 */
	static function normalizeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
			$text );
	}

	/**
	 * preg_replace_callback() handler for normalizeCharReferences().
	 *
	 * @param array $matches
	 * @return string
	 */
	static function normalizeCharReferencesCallback( $matches ) {
		$ret = null;
		if ( $matches[1] != '' ) {
			$ret = Sanitizer::normalizeEntity( $matches[1] );
		} elseif ( $matches[2] != '' ) {
			$ret = Sanitizer::decCharReference( $matches[2] );
		} elseif ( $matches[3] != '' ) {
			$ret = Sanitizer::hexCharReference( $matches[3] );
		}
		if ( is_null( $ret ) ) {
			return htmlspecialchars( $matches[0] );
		} else {
			return $ret;
		}
	}

	/**
	 * Normalize a named entity: aliases map to their canonical name;
	 * lt, gt, amp and quot stay named; other known names become numeric
	 * references; unknown names are returned with the ampersand escaped.
	 *
	 * @param string $name Entity name without '&' and ';'
	 * @return string
	 */
	static function normalizeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			return '&' . self::$htmlEntityAliases[$name] . ';';
		} elseif ( in_array( $name,
			array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
			return "&$name;";
		} elseif ( isset( self::$htmlEntities[$name] ) ) {
			return '&#' . self::$htmlEntities[$name] . ';';
		} else {
			return "&amp;$name;";
		}
	}

	/**
	 * Normalize a decimal character reference.
	 *
	 * @param string $codepoint Decimal digits
	 * @return string|null Normalized reference, or null if the code point is invalid
	 */
	static function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		if ( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#%d;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Normalize a hexadecimal character reference.
	 *
	 * @param string $codepoint Hex digits
	 * @return string|null Normalized reference, or null if the code point is invalid
	 */
	static function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		if ( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#x%x;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Tell whether a code point is accepted in a character reference:
	 * tab, LF, CR, U+0020..U+D7FF, U+E000..U+FFFD or U+10000..U+10FFFF.
	 *
	 * @param int $codepoint
	 * @return bool
	 */
	private static function validateCodepoint( $codepoint ) {
		return $codepoint == 0x09
			|| $codepoint == 0x0a
			|| $codepoint == 0x0d
			|| ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
			|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
			|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
	}

	/**
	 * Decode every character reference in the text via
	 * decodeCharReferencesCallback().
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text );
	}

	/**
	 * Like decodeCharReferences(), but passes the result through
	 * $wgContLang->normalize() when at least one replacement was made.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferencesAndNormalize( $text ) {
		global $wgContLang;
		$text = preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text, /* limit */ -1, $count );

		if ( $count ) {
			return $wgContLang->normalize( $text );
		} else {
			return $text;
		}
	}

	/**
	 * preg_replace_callback() handler for decodeCharReferences().
	 *
	 * @param array $matches
	 * @return string
	 */
	static function decodeCharReferencesCallback( $matches ) {
		if ( $matches[1] != '' ) {
			return Sanitizer::decodeEntity( $matches[1] );
		} elseif ( $matches[2] != '' ) {
			return Sanitizer::decodeChar( intval( $matches[2] ) );
		} elseif ( $matches[3] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[3] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	static function decodeChar( $codepoint
) { 01468 if ( Sanitizer::validateCodepoint( $codepoint ) ) { 01469 return codepointToUtf8( $codepoint ); 01470 } else { 01471 return UTF8_REPLACEMENT; 01472 } 01473 } 01474 01483 static function decodeEntity( $name ) { 01484 if ( isset( self::$htmlEntityAliases[$name] ) ) { 01485 $name = self::$htmlEntityAliases[$name]; 01486 } 01487 if ( isset( self::$htmlEntities[$name] ) ) { 01488 return codepointToUtf8( self::$htmlEntities[$name] ); 01489 } else { 01490 return "&$name;"; 01491 } 01492 } 01493 01500 static function attributeWhitelist( $element ) { 01501 $list = Sanitizer::setupAttributeWhitelist(); 01502 return isset( $list[$element] ) 01503 ? $list[$element] 01504 : array(); 01505 } 01506 01512 static function setupAttributeWhitelist() { 01513 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes; 01514 01515 static $whitelist, $staticInitialised; 01516 $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) ); 01517 01518 if ( isset( $whitelist ) && $staticInitialised == $globalContext ) { 01519 return $whitelist; 01520 } 01521 01522 $common = array( 01523 # HTML 01524 'id', 01525 'class', 01526 'style', 01527 'lang', 01528 'dir', 01529 'title', 01530 01531 # WAI-ARIA 01532 'role', 01533 ); 01534 01535 if ( $wgAllowRdfaAttributes ) { 01536 # RDFa attributes as specified in section 9 of 01537 # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 01538 $common = array_merge( $common, array( 01539 'about', 'property', 'resource', 'datatype', 'typeof', 01540 ) ); 01541 } 01542 01543 if ( $wgAllowMicrodataAttributes ) { 01544 # add HTML5 microdata tags as specified by 01545 # http://www.whatwg.org/html/microdata.html#the-microdata-model 01546 $common = array_merge( $common, array( 01547 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' 01548 ) ); 01549 } 01550 01551 $block = array_merge( $common, array( 'align' ) ); 01552 $tablealign = array( 'align', 'valign' ); 01553 $tablecell = array( 01554 'abbr', 01555 
'axis', 01556 'headers', 01557 'scope', 01558 'rowspan', 01559 'colspan', 01560 'nowrap', # deprecated 01561 'width', # deprecated 01562 'height', # deprecated 01563 'bgcolor', # deprecated 01564 ); 01565 01566 # Numbers refer to sections in HTML 4.01 standard describing the element. 01567 # See: http://www.w3.org/TR/html4/ 01568 $whitelist = array( 01569 # 7.5.4 01570 'div' => $block, 01571 'center' => $common, # deprecated 01572 'span' => $common, 01573 01574 # 7.5.5 01575 'h1' => $block, 01576 'h2' => $block, 01577 'h3' => $block, 01578 'h4' => $block, 01579 'h5' => $block, 01580 'h6' => $block, 01581 01582 # 7.5.6 01583 # address 01584 01585 # 8.2.4 01586 'bdo' => $common, 01587 01588 # 9.2.1 01589 'em' => $common, 01590 'strong' => $common, 01591 'cite' => $common, 01592 'dfn' => $common, 01593 'code' => $common, 01594 'samp' => $common, 01595 'kbd' => $common, 01596 'var' => $common, 01597 'abbr' => $common, 01598 # acronym 01599 01600 # 9.2.2 01601 'blockquote' => array_merge( $common, array( 'cite' ) ), 01602 'q' => array_merge( $common, array( 'cite' ) ), 01603 01604 # 9.2.3 01605 'sub' => $common, 01606 'sup' => $common, 01607 01608 # 9.3.1 01609 'p' => $block, 01610 01611 # 9.3.2 01612 'br' => array_merge( $common, array( 'clear' ) ), 01613 01614 # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element 01615 'wbr' => $common, 01616 01617 # 9.3.4 01618 'pre' => array_merge( $common, array( 'width' ) ), 01619 01620 # 9.4 01621 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ), 01622 'del' => array_merge( $common, array( 'cite', 'datetime' ) ), 01623 01624 # 10.2 01625 'ul' => array_merge( $common, array( 'type' ) ), 01626 'ol' => array_merge( $common, array( 'type', 'start' ) ), 01627 'li' => array_merge( $common, array( 'type', 'value' ) ), 01628 01629 # 10.3 01630 'dl' => $common, 01631 'dd' => $common, 01632 'dt' => $common, 01633 01634 # 11.2.1 01635 'table' => array_merge( $common, 01636 array( 'summary', 'width', 'border', 
'frame', 01637 'rules', 'cellspacing', 'cellpadding', 01638 'align', 'bgcolor', 01639 ) ), 01640 01641 # 11.2.2 01642 'caption' => $block, 01643 01644 # 11.2.3 01645 'thead' => $common, 01646 'tfoot' => $common, 01647 'tbody' => $common, 01648 01649 # 11.2.4 01650 'colgroup' => array_merge( $common, array( 'span' ) ), 01651 'col' => array_merge( $common, array( 'span' ) ), 01652 01653 # 11.2.5 01654 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ), 01655 01656 # 11.2.6 01657 'td' => array_merge( $common, $tablecell, $tablealign ), 01658 'th' => array_merge( $common, $tablecell, $tablealign ), 01659 01660 # 12.2 01661 # NOTE: <a> is not allowed directly, but the attrib 01662 # whitelist is used from the Parser object 01663 'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa 01664 01665 # 13.2 01666 # Not usually allowed, but may be used for extension-style hooks 01667 # such as <math> when it is rasterized, or if $wgAllowImageTag is 01668 # true 01669 'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ), 01670 01671 # 15.2.1 01672 'tt' => $common, 01673 'b' => $common, 01674 'i' => $common, 01675 'big' => $common, 01676 'small' => $common, 01677 'strike' => $common, 01678 's' => $common, 01679 'u' => $common, 01680 01681 # 15.2.2 01682 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ), 01683 # basefont 01684 01685 # 15.3 01686 'hr' => array_merge( $common, array( 'width' ) ), 01687 01688 # HTML Ruby annotation text module, simple ruby only. 
01689 # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element 01690 'ruby' => $common, 01691 # rbc 01692 # rtc 01693 'rb' => $common, 01694 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), 01695 'rp' => $common, 01696 01697 # MathML root element, where used for extensions 01698 # 'title' may not be 100% valid here; it's XHTML 01699 # http://www.w3.org/TR/REC-MathML/ 01700 'math' => array( 'class', 'style', 'id', 'title' ), 01701 01702 # HTML 5 section 4.6 01703 'bdi' => $common, 01704 01705 # HTML5 elements, defined by: 01706 # http://www.whatwg.org/html/ 01707 'data' => array_merge( $common, array( 'value' ) ), 01708 'time' => array_merge( $common, array( 'datetime' ) ), 01709 'mark' => $common, 01710 01711 // meta and link are only permitted by removeHTMLtags when Microdata 01712 // is enabled so we don't bother adding a conditional to hide these 01713 // Also meta and link are only valid in WikiText as Microdata elements 01714 // (ie: validateTag rejects tags missing the attributes needed for Microdata) 01715 // So we don't bother including $common attributes that have no purpose. 01716 'meta' => array( 'itemprop', 'content' ), 01717 'link' => array( 'itemprop', 'href' ), 01718 ); 01719 01720 $staticInitialised = $globalContext; 01721 01722 return $whitelist; 01723 } 01724 01735 static function stripAllTags( $text ) { 01736 # Actual <tags> 01737 $text = StringUtils::delimiterReplace( '<', '>', '', $text ); 01738 01739 # Normalize &entities and whitespace 01740 $text = self::decodeCharReferences( $text ); 01741 $text = self::normalizeWhitespace( $text ); 01742 01743 return $text; 01744 } 01745 01755 static function hackDocType() { 01756 $out = "<!DOCTYPE html [\n"; 01757 foreach ( self::$htmlEntities as $entity => $codepoint ) { 01758 $out .= "<!ENTITY $entity \"&#$codepoint;\">"; 01759 } 01760 $out .= "]>\n"; 01761 return $out; 01762 } 01763 01768 static function cleanUrl( $url ) { 01769 # Normalize any HTML entities in input. 
They will be 01770 # re-escaped by makeExternalLink(). 01771 $url = Sanitizer::decodeCharReferences( $url ); 01772 01773 # Escape any control characters introduced by the above step 01774 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 01775 array( __CLASS__, 'cleanUrlCallback' ), $url ); 01776 01777 # Validate hostname portion 01778 $matches = array(); 01779 if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { 01780 list( /* $whole */, $protocol, $host, $rest ) = $matches; 01781 01782 // Characters that will be ignored in IDNs. 01783 // http://tools.ietf.org/html/3454#section-3.1 01784 // Strip them before further processing so blacklists and such work. 01785 $strip = "/ 01786 \\s| # general whitespace 01787 \xc2\xad| # 00ad SOFT HYPHEN 01788 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 01789 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 01790 \xe2\x81\xa0| # 2060 WORD JOINER 01791 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 01792 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 01793 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 01794 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 01795 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 01796 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 01797 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 01798 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 01799 /xuD"; 01800 01801 $host = preg_replace( $strip, '', $host ); 01802 01803 // @todo FIXME: Validate hostnames here 01804 01805 return $protocol . $host . 
$rest; 01806 } else { 01807 return $url; 01808 } 01809 } 01810 01815 static function cleanUrlCallback( $matches ) { 01816 return urlencode( $matches[0] ); 01817 } 01818 01847 public static function validateEmail( $addr ) { 01848 $result = null; 01849 if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) { 01850 return $result; 01851 } 01852 01853 // Please note strings below are enclosed in brackets [], this make the 01854 // hyphen "-" a range indicator. Hence it is double backslashed below. 01855 // See bug 26948 01856 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~"; 01857 $rfc1034_ldh_str = "a-z0-9\\-"; 01858 01859 $html5_email_regexp = "/ 01860 ^ # start of string 01861 [$rfc5322_atext\\.]+ # user part which is liberal :p 01862 @ # 'apostrophe' 01863 [$rfc1034_ldh_str]+ # First domain part 01864 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot 01865 $ # End of string 01866 /ix"; // case Insensitive, eXtended 01867 01868 return (bool)preg_match( $html5_email_regexp, $addr ); 01869 } 01870 }