MediaWiki
REL1_22
|
00001 <?php 00031 class Sanitizer { 00036 const CHAR_REFS_REGEX = 00037 '/&([A-Za-z0-9\x80-\xff]+); 00038 |&\#([0-9]+); 00039 |&\#[xX]([0-9A-Fa-f]+); 00040 |(&)/x'; 00041 00050 const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; 00051 const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; 00052 00058 private static $htmlEntities = array( 00059 'Aacute' => 193, 00060 'aacute' => 225, 00061 'Acirc' => 194, 00062 'acirc' => 226, 00063 'acute' => 180, 00064 'AElig' => 198, 00065 'aelig' => 230, 00066 'Agrave' => 192, 00067 'agrave' => 224, 00068 'alefsym' => 8501, 00069 'Alpha' => 913, 00070 'alpha' => 945, 00071 'amp' => 38, 00072 'and' => 8743, 00073 'ang' => 8736, 00074 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE. 00075 'Aring' => 197, 00076 'aring' => 229, 00077 'asymp' => 8776, 00078 'Atilde' => 195, 00079 'atilde' => 227, 00080 'Auml' => 196, 00081 'auml' => 228, 00082 'bdquo' => 8222, 00083 'Beta' => 914, 00084 'beta' => 946, 00085 'brvbar' => 166, 00086 'bull' => 8226, 00087 'cap' => 8745, 00088 'Ccedil' => 199, 00089 'ccedil' => 231, 00090 'cedil' => 184, 00091 'cent' => 162, 00092 'Chi' => 935, 00093 'chi' => 967, 00094 'circ' => 710, 00095 'clubs' => 9827, 00096 'cong' => 8773, 00097 'copy' => 169, 00098 'crarr' => 8629, 00099 'cup' => 8746, 00100 'curren' => 164, 00101 'dagger' => 8224, 00102 'Dagger' => 8225, 00103 'darr' => 8595, 00104 'dArr' => 8659, 00105 'deg' => 176, 00106 'Delta' => 916, 00107 'delta' => 948, 00108 'diams' => 9830, 00109 'divide' => 247, 00110 'Eacute' => 201, 00111 'eacute' => 233, 00112 'Ecirc' => 202, 00113 'ecirc' => 234, 00114 'Egrave' => 200, 00115 'egrave' => 232, 00116 'empty' => 8709, 00117 'emsp' => 8195, 00118 'ensp' => 8194, 00119 'Epsilon' => 917, 00120 'epsilon' => 949, 00121 'equiv' => 8801, 00122 'Eta' => 919, 00123 'eta' => 951, 00124 'ETH' => 208, 00125 'eth' => 240, 00126 'Euml' => 203, 00127 'euml' => 235, 00128 'euro' => 8364, 00129 'exist' => 
8707, 00130 'fnof' => 402, 00131 'forall' => 8704, 00132 'frac12' => 189, 00133 'frac14' => 188, 00134 'frac34' => 190, 00135 'frasl' => 8260, 00136 'Gamma' => 915, 00137 'gamma' => 947, 00138 'ge' => 8805, 00139 'gt' => 62, 00140 'harr' => 8596, 00141 'hArr' => 8660, 00142 'hearts' => 9829, 00143 'hellip' => 8230, 00144 'Iacute' => 205, 00145 'iacute' => 237, 00146 'Icirc' => 206, 00147 'icirc' => 238, 00148 'iexcl' => 161, 00149 'Igrave' => 204, 00150 'igrave' => 236, 00151 'image' => 8465, 00152 'infin' => 8734, 00153 'int' => 8747, 00154 'Iota' => 921, 00155 'iota' => 953, 00156 'iquest' => 191, 00157 'isin' => 8712, 00158 'Iuml' => 207, 00159 'iuml' => 239, 00160 'Kappa' => 922, 00161 'kappa' => 954, 00162 'Lambda' => 923, 00163 'lambda' => 955, 00164 'lang' => 9001, 00165 'laquo' => 171, 00166 'larr' => 8592, 00167 'lArr' => 8656, 00168 'lceil' => 8968, 00169 'ldquo' => 8220, 00170 'le' => 8804, 00171 'lfloor' => 8970, 00172 'lowast' => 8727, 00173 'loz' => 9674, 00174 'lrm' => 8206, 00175 'lsaquo' => 8249, 00176 'lsquo' => 8216, 00177 'lt' => 60, 00178 'macr' => 175, 00179 'mdash' => 8212, 00180 'micro' => 181, 00181 'middot' => 183, 00182 'minus' => 8722, 00183 'Mu' => 924, 00184 'mu' => 956, 00185 'nabla' => 8711, 00186 'nbsp' => 160, 00187 'ndash' => 8211, 00188 'ne' => 8800, 00189 'ni' => 8715, 00190 'not' => 172, 00191 'notin' => 8713, 00192 'nsub' => 8836, 00193 'Ntilde' => 209, 00194 'ntilde' => 241, 00195 'Nu' => 925, 00196 'nu' => 957, 00197 'Oacute' => 211, 00198 'oacute' => 243, 00199 'Ocirc' => 212, 00200 'ocirc' => 244, 00201 'OElig' => 338, 00202 'oelig' => 339, 00203 'Ograve' => 210, 00204 'ograve' => 242, 00205 'oline' => 8254, 00206 'Omega' => 937, 00207 'omega' => 969, 00208 'Omicron' => 927, 00209 'omicron' => 959, 00210 'oplus' => 8853, 00211 'or' => 8744, 00212 'ordf' => 170, 00213 'ordm' => 186, 00214 'Oslash' => 216, 00215 'oslash' => 248, 00216 'Otilde' => 213, 00217 'otilde' => 245, 00218 'otimes' => 8855, 00219 'Ouml' => 214, 00220 
'ouml' => 246, 00221 'para' => 182, 00222 'part' => 8706, 00223 'permil' => 8240, 00224 'perp' => 8869, 00225 'Phi' => 934, 00226 'phi' => 966, 00227 'Pi' => 928, 00228 'pi' => 960, 00229 'piv' => 982, 00230 'plusmn' => 177, 00231 'pound' => 163, 00232 'prime' => 8242, 00233 'Prime' => 8243, 00234 'prod' => 8719, 00235 'prop' => 8733, 00236 'Psi' => 936, 00237 'psi' => 968, 00238 'quot' => 34, 00239 'radic' => 8730, 00240 'rang' => 9002, 00241 'raquo' => 187, 00242 'rarr' => 8594, 00243 'rArr' => 8658, 00244 'rceil' => 8969, 00245 'rdquo' => 8221, 00246 'real' => 8476, 00247 'reg' => 174, 00248 'rfloor' => 8971, 00249 'Rho' => 929, 00250 'rho' => 961, 00251 'rlm' => 8207, 00252 'rsaquo' => 8250, 00253 'rsquo' => 8217, 00254 'sbquo' => 8218, 00255 'Scaron' => 352, 00256 'scaron' => 353, 00257 'sdot' => 8901, 00258 'sect' => 167, 00259 'shy' => 173, 00260 'Sigma' => 931, 00261 'sigma' => 963, 00262 'sigmaf' => 962, 00263 'sim' => 8764, 00264 'spades' => 9824, 00265 'sub' => 8834, 00266 'sube' => 8838, 00267 'sum' => 8721, 00268 'sup' => 8835, 00269 'sup1' => 185, 00270 'sup2' => 178, 00271 'sup3' => 179, 00272 'supe' => 8839, 00273 'szlig' => 223, 00274 'Tau' => 932, 00275 'tau' => 964, 00276 'there4' => 8756, 00277 'Theta' => 920, 00278 'theta' => 952, 00279 'thetasym' => 977, 00280 'thinsp' => 8201, 00281 'THORN' => 222, 00282 'thorn' => 254, 00283 'tilde' => 732, 00284 'times' => 215, 00285 'trade' => 8482, 00286 'Uacute' => 218, 00287 'uacute' => 250, 00288 'uarr' => 8593, 00289 'uArr' => 8657, 00290 'Ucirc' => 219, 00291 'ucirc' => 251, 00292 'Ugrave' => 217, 00293 'ugrave' => 249, 00294 'uml' => 168, 00295 'upsih' => 978, 00296 'Upsilon' => 933, 00297 'upsilon' => 965, 00298 'Uuml' => 220, 00299 'uuml' => 252, 00300 'weierp' => 8472, 00301 'Xi' => 926, 00302 'xi' => 958, 00303 'Yacute' => 221, 00304 'yacute' => 253, 00305 'yen' => 165, 00306 'Yuml' => 376, 00307 'yuml' => 255, 00308 'Zeta' => 918, 00309 'zeta' => 950, 00310 'zwj' => 8205, 00311 'zwnj' => 8204 
00312 ); 00313 00317 private static $htmlEntityAliases = array( 00318 'רלמ' => 'rlm', 00319 'رلم' => 'rlm', 00320 ); 00321 00325 private static $attribsRegex; 00326 00332 static function getAttribsRegex() { 00333 if ( self::$attribsRegex === null ) { 00334 $attribFirst = '[:A-Z_a-z0-9]'; 00335 $attrib = '[:A-Z_a-z-.0-9]'; 00336 $space = '[\x09\x0a\x0d\x20]'; 00337 self::$attribsRegex = 00338 "/(?:^|$space)({$attribFirst}{$attrib}*) 00339 ($space*=$space* 00340 (?: 00341 # The attribute value: quoted or alone 00342 \"([^<\"]*)\" 00343 | '([^<']*)' 00344 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) 00345 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of 00346 # colors are specified like this. 00347 # We'll be normalizing it. 00348 ) 00349 )?(?=$space|\$)/sx"; 00350 } 00351 return self::$attribsRegex; 00352 } 00353 00366 static function removeHTMLtags( $text, $processCallback = null, 00367 $args = array(), $extratags = array(), $removetags = array() 00368 ) { 00369 global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag; 00370 00371 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, 00372 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised; 00373 00374 wfProfileIn( __METHOD__ ); 00375 00376 // Base our staticInitialised variable off of the global config state so that if the globals 00377 // are changed (like in the screwed up test system) we will re-initialise the settings. 
00378 $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) ); 00379 if ( !$staticInitialised || $staticInitialised != $globalContext ) { 00380 00381 $htmlpairsStatic = array( # Tags that must be closed 00382 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 00383 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 00384 'strike', 'strong', 'tt', 'var', 'div', 'center', 00385 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 00386 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn', 00387 'kbd', 'samp', 'data', 'time', 'mark' 00388 ); 00389 $htmlsingle = array( 00390 'br', 'wbr', 'hr', 'li', 'dt', 'dd' 00391 ); 00392 $htmlsingleonly = array( # Elements that cannot have close tags 00393 'br', 'wbr', 'hr' 00394 ); 00395 if ( $wgAllowMicrodataAttributes ) { 00396 $htmlsingle[] = $htmlsingleonly[] = 'meta'; 00397 $htmlsingle[] = $htmlsingleonly[] = 'link'; 00398 } 00399 $htmlnest = array( # Tags that can be nested--?? 
00400 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 00401 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span', 00402 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo' 00403 ); 00404 $tabletags = array( # Can only appear inside table, we will close them 00405 'td', 'th', 'tr', 00406 ); 00407 $htmllist = array( # Tags used by list 00408 'ul', 'ol', 00409 ); 00410 $listtags = array( # Tags that can appear in a list 00411 'li', 00412 ); 00413 00414 if ( $wgAllowImageTag ) { 00415 $htmlsingle[] = 'img'; 00416 $htmlsingleonly[] = 'img'; 00417 } 00418 00419 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) ); 00420 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) ); 00421 00422 # Convert them all to hashtables for faster lookup 00423 $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 00424 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ); 00425 foreach ( $vars as $var ) { 00426 $$var = array_flip( $$var ); 00427 } 00428 $staticInitialised = $globalContext; 00429 } 00430 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays 00431 $extratags = array_flip( $extratags ); 00432 $removetags = array_flip( $removetags ); 00433 $htmlpairs = array_merge( $extratags, $htmlpairsStatic ); 00434 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags ); 00435 00436 # Remove HTML comments 00437 $text = Sanitizer::removeHTMLcomments( $text ); 00438 $bits = explode( '<', $text ); 00439 $text = str_replace( '>', '>', array_shift( $bits ) ); 00440 if ( !$wgUseTidy ) { 00441 $tagstack = $tablestack = array(); 00442 foreach ( $bits as $x ) { 00443 $regs = array(); 00444 # $slash: Does the current element start with a '/'? 
00445 # $t: Current element name 00446 # $params: String between element name and > 00447 # $brace: Ending '>' or '/>' 00448 # $rest: Everything until the next element of $bits 00449 if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { 00450 list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; 00451 } else { 00452 $slash = $t = $params = $brace = $rest = null; 00453 } 00454 00455 $badtag = false; 00456 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { 00457 # Check our stack 00458 if ( $slash && isset( $htmlsingleonly[$t] ) ) { 00459 $badtag = true; 00460 } elseif ( $slash ) { 00461 # Closing a tag... is it the one we just opened? 00462 $ot = @array_pop( $tagstack ); 00463 if ( $ot != $t ) { 00464 if ( isset( $htmlsingleallowed[$ot] ) ) { 00465 # Pop all elements with an optional close tag 00466 # and see if we find a match below them 00467 $optstack = array(); 00468 array_push( $optstack, $ot ); 00469 wfSuppressWarnings(); 00470 $ot = array_pop( $tagstack ); 00471 wfRestoreWarnings(); 00472 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) { 00473 array_push( $optstack, $ot ); 00474 wfSuppressWarnings(); 00475 $ot = array_pop( $tagstack ); 00476 wfRestoreWarnings(); 00477 } 00478 if ( $t != $ot ) { 00479 # No match. 
Push the optional elements back again 00480 $badtag = true; 00481 wfSuppressWarnings(); 00482 $ot = array_pop( $optstack ); 00483 wfRestoreWarnings(); 00484 while ( $ot ) { 00485 array_push( $tagstack, $ot ); 00486 wfSuppressWarnings(); 00487 $ot = array_pop( $optstack ); 00488 wfRestoreWarnings(); 00489 } 00490 } 00491 } else { 00492 @array_push( $tagstack, $ot ); 00493 # <li> can be nested in <ul> or <ol>, skip those cases: 00494 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) { 00495 $badtag = true; 00496 } 00497 } 00498 } else { 00499 if ( $t == 'table' ) { 00500 $tagstack = array_pop( $tablestack ); 00501 } 00502 } 00503 $newparams = ''; 00504 } else { 00505 # Keep track for later 00506 if ( isset( $tabletags[$t] ) && 00507 !in_array( 'table', $tagstack ) ) { 00508 $badtag = true; 00509 } elseif ( in_array( $t, $tagstack ) && 00510 !isset( $htmlnest[$t] ) ) { 00511 $badtag = true; 00512 # Is it a self closed htmlpair ? (bug 5487) 00513 } elseif ( $brace == '/>' && 00514 isset( $htmlpairs[$t] ) ) { 00515 $badtag = true; 00516 } elseif ( isset( $htmlsingleonly[$t] ) ) { 00517 # Hack to force empty tag for unclosable elements 00518 $brace = '/>'; 00519 } elseif ( isset( $htmlsingle[$t] ) ) { 00520 # Hack to not close $htmlsingle tags 00521 $brace = null; 00522 # Still need to push this optionally-closed tag to 00523 # the tag stack so that we can match end tags 00524 # instead of marking them as bad. 00525 array_push( $tagstack, $t ); 00526 } elseif ( isset( $tabletags[$t] ) 00527 && in_array( $t, $tagstack ) ) { 00528 // New table tag but forgot to close the previous one 00529 $text .= "</$t>"; 00530 } else { 00531 if ( $t == 'table' ) { 00532 array_push( $tablestack, $tagstack ); 00533 $tagstack = array(); 00534 } 00535 array_push( $tagstack, $t ); 00536 } 00537 00538 # Replace any variables or template parameters with 00539 # plaintext results. 
00540 if ( is_callable( $processCallback ) ) { 00541 call_user_func_array( $processCallback, array( &$params, $args ) ); 00542 } 00543 00544 if ( !Sanitizer::validateTag( $params, $t ) ) { 00545 $badtag = true; 00546 } 00547 00548 # Strip non-approved attributes from the tag 00549 $newparams = Sanitizer::fixTagAttributes( $params, $t ); 00550 } 00551 if ( !$badtag ) { 00552 $rest = str_replace( '>', '>', $rest ); 00553 $close = ( $brace == '/>' && !$slash ) ? ' /' : ''; 00554 $text .= "<$slash$t$newparams$close>$rest"; 00555 continue; 00556 } 00557 } 00558 $text .= '<' . str_replace( '>', '>', $x ); 00559 } 00560 # Close off any remaining tags 00561 while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) { 00562 $text .= "</$t>\n"; 00563 if ( $t == 'table' ) { 00564 $tagstack = array_pop( $tablestack ); 00565 } 00566 } 00567 } else { 00568 # this might be possible using tidy itself 00569 foreach ( $bits as $x ) { 00570 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', 00571 $x, $regs ); 00572 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; 00573 $badtag = false; 00574 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { 00575 if ( is_callable( $processCallback ) ) { 00576 call_user_func_array( $processCallback, array( &$params, $args ) ); 00577 } 00578 00579 if ( !Sanitizer::validateTag( $params, $t ) ) { 00580 $badtag = true; 00581 } 00582 00583 $newparams = Sanitizer::fixTagAttributes( $params, $t ); 00584 if ( !$badtag ) { 00585 $rest = str_replace( '>', '>', $rest ); 00586 $text .= "<$slash$t$newparams$brace$rest"; 00587 continue; 00588 } 00589 } 00590 $text .= '<' . 
str_replace( '>', '>', $x ); 00591 } 00592 } 00593 wfProfileOut( __METHOD__ ); 00594 return $text; 00595 } 00596 00607 static function removeHTMLcomments( $text ) { 00608 wfProfileIn( __METHOD__ ); 00609 while ( ( $start = strpos( $text, '<!--' ) ) !== false ) { 00610 $end = strpos( $text, '-->', $start + 4 ); 00611 if ( $end === false ) { 00612 # Unterminated comment; bail out 00613 break; 00614 } 00615 00616 $end += 3; 00617 00618 # Trim space and newline if the comment is both 00619 # preceded and followed by a newline 00620 $spaceStart = max( $start - 1, 0 ); 00621 $spaceLen = $end - $spaceStart; 00622 while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) { 00623 $spaceStart--; 00624 $spaceLen++; 00625 } 00626 while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) { 00627 $spaceLen++; 00628 } 00629 if ( substr( $text, $spaceStart, 1 ) === "\n" 00630 && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) { 00631 # Remove the comment, leading and trailing 00632 # spaces, and leave only one newline. 00633 $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 ); 00634 } 00635 else { 00636 # Remove just the comment. 
00637 $text = substr_replace( $text, '', $start, $end - $start ); 00638 } 00639 } 00640 wfProfileOut( __METHOD__ ); 00641 return $text; 00642 } 00643 00656 static function validateTag( $params, $element ) { 00657 $params = Sanitizer::decodeTagAttributes( $params ); 00658 00659 if ( $element == 'meta' || $element == 'link' ) { 00660 if ( !isset( $params['itemprop'] ) ) { 00661 // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content 00662 return false; 00663 } 00664 if ( $element == 'meta' && !isset( $params['content'] ) ) { 00665 // <meta> must have a content="" for the itemprop 00666 return false; 00667 } 00668 if ( $element == 'link' && !isset( $params['href'] ) ) { 00669 // <link> must have an associated href="" 00670 return false; 00671 } 00672 } 00673 00674 return true; 00675 } 00676 00692 static function validateTagAttributes( $attribs, $element ) { 00693 return Sanitizer::validateAttributes( $attribs, 00694 Sanitizer::attributeWhitelist( $element ) ); 00695 } 00696 00712 static function validateAttributes( $attribs, $whitelist ) { 00713 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes; 00714 00715 $whitelist = array_flip( $whitelist ); 00716 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/'; 00717 00718 $out = array(); 00719 foreach ( $attribs as $attribute => $value ) { 00720 #allow XML namespace declaration if RDFa is enabled 00721 if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) { 00722 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) { 00723 $out[$attribute] = $value; 00724 } 00725 00726 continue; 00727 } 00728 00729 # Allow any attribute beginning with "data-" 00730 if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) { 00731 continue; 00732 } 00733 00734 # Strip javascript "expression" from stylesheets. 
00735 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp 00736 if ( $attribute == 'style' ) { 00737 $value = Sanitizer::checkCss( $value ); 00738 } 00739 00740 if ( $attribute === 'id' ) { 00741 $value = Sanitizer::escapeId( $value, 'noninitial' ); 00742 } 00743 00744 # WAI-ARIA 00745 # http://www.w3.org/TR/wai-aria/ 00746 # http://www.whatwg.org/html/elements.html#wai-aria 00747 # For now we only support role="presentation" until we work out what roles should be 00748 # usable by content and we ensure that our code explicitly rejects patterns that 00749 # violate HTML5's ARIA restrictions. 00750 if ( $attribute === 'role' && $value !== 'presentation' ) { 00751 continue; 00752 } 00753 00754 // RDFa and microdata properties allow URLs, URIs and/or CURIs. 00755 // Check them for sanity. 00756 if ( $attribute === 'rel' || $attribute === 'rev' 00757 # RDFa 00758 || $attribute === 'about' || $attribute === 'property' 00759 || $attribute === 'resource' || $attribute === 'datatype' 00760 || $attribute === 'typeof' 00761 # HTML5 microdata 00762 || $attribute === 'itemid' || $attribute === 'itemprop' 00763 || $attribute === 'itemref' || $attribute === 'itemscope' 00764 || $attribute === 'itemtype' 00765 ) { 00766 //Paranoia. Allow "simple" values but suppress javascript 00767 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) { 00768 continue; 00769 } 00770 } 00771 00772 # NOTE: even though elements using href/src are not allowed directly, supply 00773 # validation code that can be used by tag hook handlers, etc 00774 if ( $attribute === 'href' || $attribute === 'src' ) { 00775 if ( !preg_match( $hrefExp, $value ) ) { 00776 continue; //drop any href or src attributes not using an allowed protocol. 00777 // NOTE: this also drops all relative URLs 00778 } 00779 } 00780 00781 // If this attribute was previously set, override it. 00782 // Output should only have one attribute of each name. 
00783 $out[$attribute] = $value; 00784 } 00785 00786 if ( $wgAllowMicrodataAttributes ) { 00787 # itemtype, itemid, itemref don't make sense without itemscope 00788 if ( !array_key_exists( 'itemscope', $out ) ) { 00789 unset( $out['itemtype'] ); 00790 unset( $out['itemid'] ); 00791 unset( $out['itemref'] ); 00792 } 00793 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref. 00794 } 00795 return $out; 00796 } 00797 00808 static function mergeAttributes( $a, $b ) { 00809 $out = array_merge( $a, $b ); 00810 if ( isset( $a['class'] ) && isset( $b['class'] ) 00811 && is_string( $a['class'] ) && is_string( $b['class'] ) 00812 && $a['class'] !== $b['class'] 00813 ) { 00814 $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}", 00815 -1, PREG_SPLIT_NO_EMPTY ); 00816 $out['class'] = implode( ' ', array_unique( $classes ) ); 00817 } 00818 return $out; 00819 } 00820 00830 public static function normalizeCss( $value ) { 00831 00832 // Decode character references like { 00833 $value = Sanitizer::decodeCharReferences( $value ); 00834 00835 // Decode escape sequences and line continuation 00836 // See the grammar in the CSS 2 spec, appendix D. 00837 // This has to be done AFTER decoding character references. 00838 // This means it isn't possible for this function to return 00839 // unsanitized escape sequences. It is possible to manufacture 00840 // input that contains character references that decode to 00841 // escape sequences that decode to character references, but 00842 // it's OK for the return value to contain character references 00843 // because the caller is supposed to escape those anyway. 00844 static $decodeRegex; 00845 if ( !$decodeRegex ) { 00846 $space = '[\\x20\\t\\r\\n\\f]'; 00847 $nl = '(?:\\n|\\r\\n|\\r|\\f)'; 00848 $backslash = '\\\\'; 00849 $decodeRegex = "/ $backslash 00850 (?: 00851 ($nl) | # 1. Line continuation 00852 ([0-9A-Fa-f]{1,6})$space? | # 2. character number 00853 (.) | # 3. 
backslash cancelling special meaning 00854 () | # 4. backslash at end of string 00855 )/xu"; 00856 } 00857 $value = preg_replace_callback( $decodeRegex, 00858 array( __CLASS__, 'cssDecodeCallback' ), $value ); 00859 00860 // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii 00861 $value = preg_replace_callback( 00862 '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088) 00863 function ( $matches ) { 00864 $cp = utf8ToCodepoint( $matches[0] ); 00865 if ( $cp === false ) { 00866 return ''; 00867 } 00868 return chr( $cp - 65248 ); // ASCII range \x21-\x7A 00869 }, 00870 $value 00871 ); 00872 00873 // Convert more characters IE6 might treat as ascii 00874 // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D 00875 $value = str_replace( 00876 array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ), 00877 array( 'r', 'n', 'n', 'l', 'i', '(', '(' ), 00878 $value 00879 ); 00880 00881 // Let the value through if it's nothing but a single comment, to 00882 // allow other functions which may reject it to pass some error 00883 // message through. 00884 if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) { 00885 // Remove any comments; IE gets token splitting wrong 00886 // This must be done AFTER decoding character references and 00887 // escape sequences, because those steps can introduce comments 00888 // This step cannot introduce character references or escape 00889 // sequences, because it replaces comments with spaces rather 00890 // than removing them completely. 00891 $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value ); 00892 00893 // Remove anything after a comment-start token, to guard against 00894 // incorrect client implementations. 
00895 $commentPos = strpos( $value, '/*' ); 00896 if ( $commentPos !== false ) { 00897 $value = substr( $value, 0, $commentPos ); 00898 } 00899 } 00900 00901 // S followed by repeat, iteration, or prolonged sound marks, 00902 // which IE will treat as "ss" 00903 $value = preg_replace( 00904 '/s(?: 00905 \xE3\x80\xB1 | # U+3031 00906 \xE3\x82\x9D | # U+309D 00907 \xE3\x83\xBC | # U+30FC 00908 \xE3\x83\xBD | # U+30FD 00909 \xEF\xB9\xBC | # U+FE7C 00910 \xEF\xB9\xBD | # U+FE7D 00911 \xEF\xBD\xB0 # U+FF70 00912 )/ix', 00913 'ss', 00914 $value 00915 ); 00916 00917 return $value; 00918 } 00919 00920 00939 static function checkCss( $value ) { 00940 $value = self::normalizeCss( $value ); 00941 00942 // Reject problematic keywords and control characters 00943 if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) { 00944 return '/* invalid control char */'; 00945 } elseif ( preg_match( 00946 '! expression 00947 | filter\s*: 00948 | accelerator\s*: 00949 | -o-link\s*: 00950 | -o-link-source\s*: 00951 | -o-replace\s*: 00952 | url\s*\( 00953 | image\s*\( 00954 | image-set\s*\( 00955 !ix', $value ) ) { 00956 return '/* insecure input */'; 00957 } 00958 return $value; 00959 } 00960 00965 static function cssDecodeCallback( $matches ) { 00966 if ( $matches[1] !== '' ) { 00967 // Line continuation 00968 return ''; 00969 } elseif ( $matches[2] !== '' ) { 00970 $char = codepointToUtf8( hexdec( $matches[2] ) ); 00971 } elseif ( $matches[3] !== '' ) { 00972 $char = $matches[3]; 00973 } else { 00974 $char = '\\'; 00975 } 00976 if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) { 00977 // These characters need to be escaped in strings 00978 // Clean up the escape sequence to avoid parsing errors by clients 00979 return '\\' . dechex( ord( $char ) ) . 
' '; 00980 } else { 00981 // Decode unnecessary escape 00982 return $char; 00983 } 00984 } 00985 01005 static function fixTagAttributes( $text, $element ) { 01006 if ( trim( $text ) == '' ) { 01007 return ''; 01008 } 01009 01010 $decoded = Sanitizer::decodeTagAttributes( $text ); 01011 $stripped = Sanitizer::validateTagAttributes( $decoded, $element ); 01012 01013 return Sanitizer::safeEncodeTagAttributes( $stripped ); 01014 } 01015 01021 static function encodeAttribute( $text ) { 01022 $encValue = htmlspecialchars( $text, ENT_QUOTES ); 01023 01024 // Whitespace is normalized during attribute decoding, 01025 // so if we've been passed non-spaces we must encode them 01026 // ahead of time or they won't be preserved. 01027 $encValue = strtr( $encValue, array( 01028 "\n" => ' ', 01029 "\r" => ' ', 01030 "\t" => '	', 01031 ) ); 01032 01033 return $encValue; 01034 } 01035 01042 static function safeEncodeAttribute( $text ) { 01043 $encValue = Sanitizer::encodeAttribute( $text ); 01044 01045 # Templates and links may be expanded in later parsing, 01046 # creating invalid or dangerous output. Suppress this. 01047 $encValue = strtr( $encValue, array( 01048 '<' => '<', // This should never happen, 01049 '>' => '>', // we've received invalid input 01050 '"' => '"', // which should have been escaped. 01051 '{' => '{', 01052 '[' => '[', 01053 "''" => '''', 01054 'ISBN' => 'ISBN', 01055 'RFC' => 'RFC', 01056 'PMID' => 'PMID', 01057 '|' => '|', 01058 '__' => '__', 01059 ) ); 01060 01061 # Stupid hack 01062 $encValue = preg_replace_callback( 01063 '/((?i)' . wfUrlProtocols() . 
')/', 01064 array( 'Sanitizer', 'armorLinksCallback' ), 01065 $encValue ); 01066 return $encValue; 01067 } 01068 01100 static function escapeId( $id, $options = array() ) { 01101 global $wgExperimentalHtmlIds; 01102 $options = (array)$options; 01103 01104 if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { 01105 $id = Sanitizer::decodeCharReferences( $id ); 01106 $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); 01107 $id = trim( $id, '_' ); 01108 if ( $id === '' ) { 01109 # Must have been all whitespace to start with. 01110 return '_'; 01111 } else { 01112 return $id; 01113 } 01114 } 01115 01116 # HTML4-style escaping 01117 static $replace = array( 01118 '%3A' => ':', 01119 '%' => '.' 01120 ); 01121 01122 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); 01123 $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); 01124 01125 if ( !preg_match( '/^[a-zA-Z]/', $id ) 01126 && !in_array( 'noninitial', $options ) ) { 01127 // Initial character must be a letter! 01128 $id = "x$id"; 01129 } 01130 return $id; 01131 } 01132 01144 static function escapeClass( $class ) { 01145 // Convert ugly stuff to underscores and kill underscores in ugly places 01146 return rtrim( preg_replace( 01147 array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ), 01148 '_', 01149 $class ), '_' ); 01150 } 01151 01159 static function escapeHtmlAllowEntities( $html ) { 01160 $html = Sanitizer::decodeCharReferences( $html ); 01161 # It seems wise to escape ' as well as ", as a matter of course. Can't 01162 # hurt. 
$html = htmlspecialchars( $html, ENT_QUOTES );
		return $html;
	}

	/**
	 * Regex callback: replace every colon in the first captured group with
	 * the numeric character reference for a colon.
	 *
	 * @param array $matches Match set from preg_replace_callback(); only
	 *   $matches[1] is read
	 * @return string
	 */
	private static function armorLinksCallback( $matches ) {
		// The replacement has to be the character reference; replacing ':'
		// with a literal ':' would leave the text unchanged.
		return str_replace( ':', '&#58;', $matches[1] );
	}

	/**
	 * Parse a raw attribute string into an associative array.
	 *
	 * Attribute names are lowercased; values have runs of tab/CR/LF/space
	 * collapsed to a single space, are trimmed, and have their character
	 * references decoded. When a name occurs more than once the last
	 * occurrence wins.
	 *
	 * @param string $text Raw attribute text
	 * @return array Map of attribute name => decoded value; empty when $text
	 *   is blank or nothing matches the attribute regex
	 */
	public static function decodeTagAttributes( $text ) {
		if ( trim( $text ) === '' ) {
			return array();
		}

		$attribs = array();
		$pairs = array();
		if ( !preg_match_all(
			self::getAttribsRegex(),
			$text,
			$pairs,
			PREG_SET_ORDER ) ) {
			return $attribs;
		}

		foreach ( $pairs as $set ) {
			$attribute = strtolower( $set[1] );
			$value = self::getTagAttributeCallback( $set );

			// Normalize whitespace
			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
			$value = trim( $value );

			// Decode character references
			$attribs[$attribute] = self::decodeCharReferences( $value );
		}
		return $attribs;
	}

	/**
	 * Build an attribute string from an associative array.
	 *
	 * Names are passed through htmlspecialchars() and values through
	 * safeEncodeAttribute(); each pair is emitted as name="value".
	 *
	 * @param array $assoc_array Map of attribute name => value
	 * @return string Pairs joined by spaces with one leading space, or an
	 *   empty string when the array is empty
	 */
	public static function safeEncodeTagAttributes( $assoc_array ) {
		$attribs = array();
		foreach ( $assoc_array as $attribute => $value ) {
			$encAttribute = htmlspecialchars( $attribute );
			$encValue = self::safeEncodeAttribute( $value );

			$attribs[] = "$encAttribute=\"$encValue\"";
		}
		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
	}

	/**
	 * Pick the attribute value out of one match set produced by the
	 * attribute regex.
	 *
	 * Capture groups are checked from the highest index down, so the first
	 * group that is set supplies the value.
	 *
	 * @param array $set One PREG_SET_ORDER match set
	 * @throws MWException When group 2 is set but none of groups 3-6 are
	 * @return string
	 */
	private static function getTagAttributeCallback( $set ) {
		if ( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			return $set[6];
		} elseif ( isset( $set[5] ) ) {
			# No quotes.
			return $set[5];
		} elseif ( isset( $set[4] ) ) {
			# Single-quoted
			return $set[4];
		} elseif ( isset( $set[3] ) ) {
			# Double-quoted
			return $set[3];
		} elseif ( !isset( $set[2] ) ) {
			# In XHTML, attributes must have a value.
			# For 'reduced' form, return explicitly the attribute name here.
			return $set[1];
		} else {
			throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
		}
	}

	/**
	 * Normalize an attribute value: character references are normalized,
	 * whitespace characters become plain spaces, and every double quote is
	 * replaced by the &quot; entity.
	 *
	 * @param string $text
	 * @return string
	 */
	private static function normalizeAttributeValue( $text ) {
		// Quotes must become the entity; a literal '"' replacement would be a
		// no-op and leave the value able to close a double-quoted attribute.
		return str_replace( '"', '&quot;',
			self::normalizeWhitespace(
				self::normalizeCharReferences( $text ) ) );
	}

	/**
	 * Replace each CR LF pair, and each single space, CR, LF or tab, with one
	 * space character. Runs are not collapsed.
	 *
	 * @param string $text
	 * @return string
	 */
	private static function normalizeWhitespace( $text ) {
		return preg_replace(
			'/\r\n|[\x20\x0d\x0a\x09]/',
			' ',
			$text );
	}

	/**
	 * Collapse each run of spaces and/or underscores into a single space and
	 * trim the result.
	 *
	 * @param string $section
	 * @return string
	 */
	public static function normalizeSectionNameWhitespace( $section ) {
		return trim( preg_replace( '/[ _]+/', ' ', $section ) );
	}

	/**
	 * Run normalizeCharReferencesCallback() over everything in $text that
	 * matches CHAR_REFS_REGEX (named, decimal and hex references, and bare
	 * ampersands).
	 *
	 * @param string $text
	 * @return string
	 */
	public static function normalizeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
			$text );
	}

	/**
	 * Callback for normalizeCharReferences().
	 *
	 * Dispatches on which capture group matched: 1 = named entity,
	 * 2 = decimal reference, 3 = hex reference. If no group matched, or the
	 * helper returned null, the whole match is HTML-escaped instead.
	 *
	 * @param array $matches
	 * @return string
	 */
	public static function normalizeCharReferencesCallback( $matches ) {
		$ret = null;
		if ( $matches[1] !== '' ) {
			$ret = self::normalizeEntity( $matches[1] );
		} elseif ( $matches[2] !== '' ) {
			$ret = self::decCharReference( $matches[2] );
		} elseif ( $matches[3] !== '' ) {
			$ret = self::hexCharReference( $matches[3] );
		}
		if ( is_null( $ret ) ) {
			return htmlspecialchars( $matches[0] );
		} else {
			return $ret;
		}
	}

	/**
	 * Normalize a named entity.
	 *
	 * - A name found in $htmlEntityAliases is rewritten to its aliased name.
	 * - lt, gt, amp and quot are kept as named entities.
	 * - Any other name found in $htmlEntities becomes a decimal numeric
	 *   reference.
	 * - Anything else is not a known entity: its ampersand is escaped so the
	 *   text is output literally (e.g. "&amp;foo;").
	 *
	 * @param string $name Entity name without the leading '&' and trailing ';'
	 * @return string
	 */
	public static function normalizeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			return '&' . self::$htmlEntityAliases[$name] . ';';
		} elseif ( in_array( $name,
			array( 'lt', 'gt', 'amp', 'quot' ), true ) ) {
			return "&$name;";
		} elseif ( isset( self::$htmlEntities[$name] ) ) {
			return '&#' . self::$htmlEntities[$name] . ';';
		} else {
			// Unknown pseudo-entity: escape the ampersand rather than pass
			// an undefined entity through to the output.
			return "&amp;$name;";
		}
	}

	/**
	 * Re-emit a decimal character reference in canonical "&#N;" form.
	 *
	 * @param string $codepoint Decimal digits
	 * @return string|null Null when the code point fails validateCodepoint()
	 */
	public static function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		if ( self::validateCodepoint( $point ) ) {
			return sprintf( '&#%d;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Re-emit a hexadecimal character reference in canonical lowercase
	 * "&#xN;" form.
	 *
	 * @param string $codepoint Hex digits
	 * @return string|null Null when the code point fails validateCodepoint()
	 */
	public static function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		if ( self::validateCodepoint( $point ) ) {
			return sprintf( '&#x%x;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Check whether a numeric code point is in the accepted set: tab, LF, CR,
	 * 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF.
	 *
	 * @param int|float $codepoint
	 * @return bool
	 */
	private static function validateCodepoint( $codepoint ) {
		return $codepoint == 0x09
			|| $codepoint == 0x0a
			|| $codepoint == 0x0d
			|| ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
			|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
			|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
	}

	/**
	 * Decode every character reference in $text via
	 * decodeCharReferencesCallback().
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text );
	}

	/**
	 * Same as decodeCharReferences(), but when at least one replacement was
	 * made the result is additionally passed through $wgContLang->normalize().
	 *
	 * @param string $text
	 * @return string
	 */
	public static function decodeCharReferencesAndNormalize( $text ) {
		global $wgContLang;
		$text = preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text, /* limit */ -1, $count );

		if ( $count ) {
			return $wgContLang->normalize( $text );
		} else {
			return $text;
		}
	}

	/**
	 * Callback for decodeCharReferences() and
	 * decodeCharReferencesAndNormalize().
	 *
	 * @param array $matches Group 1 = entity name, 2 = decimal digits,
	 *   3 = hex digits
	 * @return string
	 */
	public static function decodeCharReferencesCallback( $matches ) {
		if ( $matches[1] !== '' ) {
			return self::decodeEntity( $matches[1] );
		} elseif ( $matches[2] !== '' ) {
			return self::decodeChar( intval( $matches[2] ) );
		} elseif ( $matches[3] !== '' ) {
			return self::decodeChar( hexdec( $matches[3] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	/**
	 * Convert a numeric code point to its UTF-8 encoding.
	 *
	 * @param int|float $codepoint
	 * @return string UTF8_REPLACEMENT when the code point fails
	 *   validateCodepoint()
	 */
	public static function decodeChar( $codepoint ) {
		if ( self::validateCodepoint( $codepoint ) ) {
			return codepointToUtf8( $codepoint );
		} else {
			return UTF8_REPLACEMENT;
		}
	}

	/**
	 * Convert a named entity to its UTF-8 character, resolving aliases from
	 * $htmlEntityAliases first.
	 *
	 * @param string $name Entity name without the leading '&' and trailing ';'
	 * @return string The character, or "&name;" (using the alias-resolved
	 *   name) when the entity is not in $htmlEntities
	 */
	public static function decodeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			$name = self::$htmlEntityAliases[$name];
		}
		if ( isset( self::$htmlEntities[$name] ) ) {
			return codepointToUtf8( self::$htmlEntities[$name] );
		} else {
			return "&$name;";
		}
	}

	/**
	 * Fetch the list of attributes allowed on an element.
	 *
	 * @param string $element Element name as used for the whitelist keys
	 * @return array List of attribute names; empty for an unlisted element
	 */
	public static function attributeWhitelist( $element ) {
		$list = self::setupAttributeWhitelist();
		return isset( $list[$element] )
			? $list[$element]
			: array();
	}

	/**
	 * Build (and cache) the map of element name => allowed attribute names.
	 *
	 * The result is kept in a function-static variable and rebuilt only when
	 * the values of $wgAllowRdfaAttributes / $wgAllowMicrodataAttributes
	 * differ from those it was built with.
	 *
	 * @return array
	 */
	public static function setupAttributeWhitelist() {
		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

		static $whitelist, $staticInitialised;
		// Cache key derived from the two settings that affect the result
		$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

		if ( isset( $whitelist ) && $staticInitialised === $globalContext ) {
			return $whitelist;
		}

		$common = array(
			# HTML
			'id',
			'class',
			'style',
			'lang',
			'dir',
			'title',

			# WAI-ARIA
			'role',
		);

		if ( $wgAllowRdfaAttributes ) {
			# RDFa attributes as specified in section 9 of
			# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
			$common = array_merge( $common, array(
				'about', 'property', 'resource', 'datatype', 'typeof',
			) );
		}

		if ( $wgAllowMicrodataAttributes ) {
			# add HTML5 microdata tags as specified by
			# http://www.whatwg.org/html/microdata.html#the-microdata-model
			$common = array_merge( $common, array(
				'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
			) );
		}

		$block = array_merge( $common, array( 'align' ) );
		$tablealign = array( 'align', 'char', 'charoff', 'valign' );
		$tablecell = array(
			'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width', # deprecated
			'height', # deprecated
			'bgcolor', # deprecated
		);

		# Numbers refer to sections in HTML 4.01 standard describing the element.
		# See: http://www.w3.org/TR/html4/
		$whitelist = array(
			# 7.5.4
			'div' => $block,
			'center' => $common, # deprecated
			'span' => $block, # ??

			# 7.5.5
			'h1' => $block,
			'h2' => $block,
			'h3' => $block,
			'h4' => $block,
			'h5' => $block,
			'h6' => $block,

			# 7.5.6
			# address

			# 8.2.4
			# bdo

			# 9.2.1
			'em' => $common,
			'strong' => $common,
			'cite' => $common,
			'dfn' => $common,
			'code' => $common,
			'samp' => $common,
			'kbd' => $common,
			'var' => $common,
			'abbr' => $common,
			# acronym

			# 9.2.2
			'blockquote' => array_merge( $common, array( 'cite' ) ),
			# q

			# 9.2.3
			'sub' => $common,
			'sup' => $common,

			# 9.3.1
			'p' => $block,

			# 9.3.2
			'br' => array( 'id', 'class', 'title', 'style', 'clear' ),

			# http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
			'wbr' => array( 'id', 'class', 'title', 'style' ),

			# 9.3.4
			'pre' => array_merge( $common, array( 'width' ) ),

			# 9.4
			'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
			'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

			# 10.2
			'ul' => array_merge( $common, array( 'type' ) ),
			'ol' => array_merge( $common, array( 'type', 'start' ) ),
			'li' => array_merge( $common, array( 'type', 'value' ) ),

			# 10.3
			'dl' => $common,
			'dd' => $common,
			'dt' => $common,

			# 11.2.1
			'table' => array_merge( $common,
				array( 'summary', 'width', 'border', 'frame',
					'rules', 'cellspacing', 'cellpadding',
					'align', 'bgcolor',
				) ),

			# 11.2.2
			'caption' => array_merge( $common, array( 'align' ) ),

			# 11.2.3
			'thead' => array_merge( $common, $tablealign ),
			'tfoot' => array_merge( $common, $tablealign ),
			'tbody' => array_merge( $common, $tablealign ),

			# 11.2.4
			'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
			'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),

			# 11.2.5
			'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

			# 11.2.6
			'td' => array_merge( $common, $tablecell, $tablealign ),
			'th' => array_merge( $common, $tablecell, $tablealign ),

			# 12.2
			# NOTE: <a> is not allowed directly, but the attrib
			# whitelist is used from the Parser object
			'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

			# 13.2
			# Not usually allowed, but may be used for extension-style hooks
			# such as <math> when it is rasterized, or if $wgAllowImageTag is
			# true
			'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

			# 15.2.1
			'tt' => $common,
			'b' => $common,
			'i' => $common,
			'big' => $common,
			'small' => $common,
			'strike' => $common,
			's' => $common,
			'u' => $common,

			# 15.2.2
			'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
			# basefont

			# 15.3
			'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

			# HTML Ruby annotation text module, simple ruby only.
			# http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
			'ruby' => $common,
			# rbc
			# rtc
			'rb' => $common,
			'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
			'rp' => $common,

			# MathML root element, where used for extensions
			# 'title' may not be 100% valid here; it's XHTML
			# http://www.w3.org/TR/REC-MathML/
			'math' => array( 'class', 'style', 'id', 'title' ),

			# HTML 5 section 4.6
			'bdi' => $common,

			# HTML5 elements, defined by:
			# http://www.whatwg.org/html/
			'data' => array_merge( $common, array( 'value' ) ),
			'time' => array_merge( $common, array( 'datetime' ) ),
			'mark' => $common,

			// meta and link are only permitted by removeHTMLtags when Microdata
			// is enabled so we don't bother adding a conditional to hide these
			// Also meta and link are only valid in WikiText as Microdata elements
			// (ie: validateTag rejects tags missing the attributes needed for Microdata)
			// So we don't bother including $common attributes that have no purpose.
			'meta' => array( 'itemprop', 'content' ),
			'link' => array( 'itemprop', 'href' ),
		);

		$staticInitialised = $globalContext;

		return $whitelist;
	}

	/**
	 * Remove everything between '<' and '>' delimiters, then decode character
	 * references and convert whitespace characters to plain spaces.
	 *
	 * The return value is plain text, not HTML: entities have been decoded,
	 * so it must be escaped again before being placed in HTML output.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function stripAllTags( $text ) {
		# Actual <tags>
		$text = StringUtils::delimiterReplace( '<', '>', '', $text );

		# Normalize &entities and whitespace
		$text = self::decodeCharReferences( $text );
		$text = self::normalizeWhitespace( $text );

		return $text;
	}

	/**
	 * Build a DOCTYPE declaration whose internal subset defines every entity
	 * in $htmlEntities as its decimal numeric character reference.
	 *
	 * @return string
	 */
	public static function hackDocType() {
		$out = "<!DOCTYPE html [\n";
		foreach ( self::$htmlEntities as $entity => $codepoint ) {
			$out .= "<!ENTITY $entity \"&#$codepoint;\">";
		}
		$out .= "]>\n";
		return $out;
	}

	/**
	 * Clean up a URL: decode character references, percent-encode square
	 * brackets, angle brackets, double quotes, pipes, bytes 0x00-0x20 and
	 * 0x7F, and strip whitespace and the listed invisible characters from the
	 * host portion.
	 *
	 * @param string $url
	 * @return string
	 */
	public static function cleanUrl( $url ) {
		# Normalize any HTML entities in input. They will be
		# re-escaped by makeExternalLink().
		$url = self::decodeCharReferences( $url );

		# Escape any control characters introduced by the above step
		$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
			array( __CLASS__, 'cleanUrlCallback' ), $url );

		# Validate hostname portion
		$matches = array();
		if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
			list( /* $whole */, $protocol, $host, $rest ) = $matches;

			// Characters that will be ignored in IDNs.
			// http://tools.ietf.org/html/3454#section-3.1
			// Strip them before further processing so blacklists and such work.
			// Extended-mode pattern: each alternative must stay on its own
			// line, because a '#' comment runs to the end of the line.
			$strip = "/
				\\s|          # general whitespace
				\xc2\xad|     # 00ad SOFT HYPHEN
				\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
				\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
				\xe2\x81\xa0| # 2060 WORD JOINER
				\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
				\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
				\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
				\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
				\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
				\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
				\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
				[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
				/xuD";

			$host = preg_replace( $strip, '', $host );

			// @todo FIXME: Validate hostnames here

			return $protocol . $host . $rest;
		} else {
			return $url;
		}
	}

	/**
	 * Callback for cleanUrl(): percent-encode the matched character.
	 *
	 * @param array $matches
	 * @return string
	 */
	public static function cleanUrlCallback( $matches ) {
		return urlencode( $matches[0] );
	}

	/**
	 * Check whether a string looks like an e-mail address.
	 *
	 * The 'isValidEmailAddr' hook runs first; if it returns false, the
	 * $result value it set is returned as-is. Otherwise the address must be
	 * one or more atext/dot characters, an '@', and one or more dot-separated
	 * labels made of letters, digits and hyphens (case-insensitive).
	 *
	 * @param string $addr
	 * @return bool|mixed Boolean regex result, or whatever the hook stored
	 *   in $result when it aborted
	 */
	public static function validateEmail( $addr ) {
		$result = null;
		if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
			return $result;
		}

		// Please note strings below are enclosed in brackets [], this make the
		// hyphen "-" a range indicator. Hence it is double backslashed below.
		// See bug 26948
		$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
		$rfc1034_ldh_str = "a-z0-9\\-";

		// Extended-mode pattern; keep one element per line so the in-pattern
		// '#' comments end where intended. The in-pattern note on the '@'
		// line says 'apostrophe' but the element is the at sign separator.
		$HTML5_email_regexp = "/
			^                      # start of string
			[$rfc5322_atext\\.]+    # user part which is liberal :p
			@                      # 'apostrophe'
			[$rfc1034_ldh_str]+       # First domain part
			(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
			$                      # End of string
			/ix"; // case Insensitive, eXtended

		return (bool)preg_match( $HTML5_email_regexp, $addr );
	}
}