MediaWiki
REL1_20
|
<?php
/**
 * Sanitizer: static helpers for cleaning up HTML tags, attributes and
 * character references.
 */
class Sanitizer {
	/**
	 * Regular expression matching the character-reference forms handled by
	 * this class. Capture groups:
	 *   1. named entity        (&name;)
	 *   2. decimal reference   (&#123;)
	 *   3. hexadecimal reference (&#x7b;)
	 *   4. a bare ampersand
	 * Uses the /x flag, so the line breaks inside the pattern are ignored.
	 */
	const CHAR_REFS_REGEX =
		'/&([A-Za-z0-9\x80-\xff]+);
		 |&\#([0-9]+);
		 |&\#[xX]([0-9A-Fa-f]+);
		 |(&)/x';

	/**
	 * Pattern flagging values that contain a javascript/vbscript scheme
	 * keyword at the start, after whitespace, or after a CSS comment end.
	 */
	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

	/**
	 * Pattern matching an XML namespace declaration attribute name (xmlns:foo).
	 */
	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

	/**
	 * Map of named character entities to their Unicode code points.
	 * Keys are case-sensitive entity names without the leading "&" and
	 * trailing ";".
	 */
	static $htmlEntities = array(
		'Aacute'   => 193,
		'aacute'   => 225,
		'Acirc'    => 194,
		'acirc'    => 226,
		'acute'    => 180,
		'AElig'    => 198,
		'aelig'    => 230,
		'Agrave'   => 192,
		'agrave'   => 224,
		'alefsym'  => 8501,
		'Alpha'    => 913,
		'alpha'    => 945,
		'amp'      => 38,
		'and'      => 8743,
		'ang'      => 8736,
		'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
		'Aring'    => 197,
		'aring'    => 229,
		'asymp'    => 8776,
		'Atilde'   => 195,
		'atilde'   => 227,
		'Auml'     => 196,
		'auml'     => 228,
		'bdquo'    => 8222,
		'Beta'     => 914,
		'beta'     => 946,
		'brvbar'   => 166,
		'bull'     => 8226,
		'cap'      => 8745,
		'Ccedil'   => 199,
		'ccedil'   => 231,
		'cedil'    => 184,
		'cent'     => 162,
		'Chi'      => 935,
		'chi'      => 967,
		'circ'     => 710,
		'clubs'    => 9827,
		'cong'     => 8773,
		'copy'     => 169,
		'crarr'    => 8629,
		'cup'      => 8746,
		'curren'   => 164,
		'dagger'   => 8224,
		'Dagger'   => 8225,
		'darr'     => 8595,
		'dArr'     => 8659,
		'deg'      => 176,
		'Delta'    => 916,
		'delta'    => 948,
		'diams'    => 9830,
		'divide'   => 247,
		'Eacute'   => 201,
		'eacute'   => 233,
		'Ecirc'    => 202,
		'ecirc'    => 234,
		'Egrave'   => 200,
		'egrave'   => 232,
		'empty'    => 8709,
		'emsp'     => 8195,
		'ensp'     => 8194,
		'Epsilon'  => 917,
		'epsilon'  => 949,
		'equiv'    => 8801,
		'Eta'      => 919,
		'eta'      => 951,
		'ETH'      => 208,
		'eth'      => 240,
		'Euml'     => 203,
		'euml'     => 235,
		'euro'     => 8364,
		'exist'    => 8707,
		'fnof'     => 402,
		'forall'   => 8704,
		'frac12'   => 189,
		'frac14'   => 188,
		'frac34'   => 190,
		'frasl'    => 8260,
		'Gamma'    => 915,
		'gamma'    => 947,
		'ge'       => 8805,
		'gt'       => 62,
		'harr'     => 8596,
		'hArr'     => 8660,
		'hearts'   => 9829,
		'hellip'   => 8230,
		'Iacute'   => 205,
		'iacute'   => 237,
		'Icirc'    => 206,
		'icirc'    => 238,
		'iexcl'    => 161,
		'Igrave'   => 204,
		'igrave'   => 236,
		'image'    => 8465,
		'infin'    => 8734,
		'int'      => 8747,
		'Iota'     => 921,
		'iota'     => 953,
		'iquest'   => 191,
		'isin'     => 8712,
		'Iuml'     => 207,
		'iuml'     => 239,
		'Kappa'    => 922,
		'kappa'    => 954,
		'Lambda'   => 923,
		'lambda'   => 955,
		'lang'     => 9001,
		'laquo'    => 171,
		'larr'     => 8592,
		'lArr'     => 8656,
		'lceil'    => 8968,
		'ldquo'    => 8220,
		'le'       => 8804,
		'lfloor'   => 8970,
		'lowast'   => 8727,
		'loz'      => 9674,
		'lrm'      => 8206,
		'lsaquo'   => 8249,
		'lsquo'    => 8216,
		'lt'       => 60,
		'macr'     => 175,
		'mdash'    => 8212,
		'micro'    => 181,
		'middot'   => 183,
		'minus'    => 8722,
		'Mu'       => 924,
		'mu'       => 956,
		'nabla'    => 8711,
		'nbsp'     => 160,
		'ndash'    => 8211,
		'ne'       => 8800,
		'ni'       => 8715,
		'not'      => 172,
		'notin'    => 8713,
		'nsub'     => 8836,
		'Ntilde'   => 209,
		'ntilde'   => 241,
		'Nu'       => 925,
		'nu'       => 957,
		'Oacute'   => 211,
		'oacute'   => 243,
		'Ocirc'    => 212,
		'ocirc'    => 244,
		'OElig'    => 338,
		'oelig'    => 339,
		'Ograve'   => 210,
		'ograve'   => 242,
		'oline'    => 8254,
		'Omega'    => 937,
		'omega'    => 969,
		'Omicron'  => 927,
		'omicron'  => 959,
		'oplus'    => 8853,
		'or'       => 8744,
		'ordf'     => 170,
		'ordm'     => 186,
		'Oslash'   => 216,
		'oslash'   => 248,
		'Otilde'   => 213,
		'otilde'   => 245,
		'otimes'   => 8855,
		'Ouml'     => 214,
		'ouml'     => 246,
		'para'     => 182,
		'part'     => 8706,
		'permil'   => 8240,
		'perp'     => 8869,
		'Phi'      => 934,
		'phi'      => 966,
		'Pi'       => 928,
		'pi'       => 960,
		'piv'      => 982,
		'plusmn'   => 177,
		'pound'    => 163,
		'prime'    => 8242,
		'Prime'    => 8243,
		'prod'     => 8719,
		'prop'     => 8733,
		'Psi'      => 936,
		'psi'      => 968,
		'quot'     => 34,
		'radic'    => 8730,
		'rang'     => 9002,
		'raquo'    => 187,
		'rarr'     => 8594,
		'rArr'     => 8658,
		'rceil'    => 8969,
		'rdquo'    => 8221,
		'real'     => 8476,
		'reg'      => 174,
		'rfloor'   => 8971,
		'Rho'      => 929,
		'rho'      => 961,
		'rlm'      => 8207,
		'rsaquo'   => 8250,
		'rsquo'    => 8217,
		'sbquo'    => 8218,
		'Scaron'   => 352,
		'scaron'   => 353,
		'sdot'     => 8901,
		'sect'     => 167,
		'shy'      => 173,
		'Sigma'    => 931,
		'sigma'    => 963,
		'sigmaf'   => 962,
		'sim'      => 8764,
		'spades'   => 9824,
		'sub'      => 8834,
		'sube'     => 8838,
		'sum'      => 8721,
		'sup'      => 8835,
		'sup1'     => 185,
		'sup2'     => 178,
		'sup3'     => 179,
		'supe'     => 8839,
		'szlig'    => 223,
		'Tau'      => 932,
		'tau'      => 964,
		'there4'   => 8756,
		'Theta'    => 920,
		'theta'    => 952,
		'thetasym' => 977,
		'thinsp'   => 8201,
		'THORN'    => 222,
		'thorn'    => 254,
		'tilde'    => 732,
		'times'    => 215,
		'trade'    => 8482,
		'Uacute'   => 218,
		'uacute'   => 250,
		'uarr'     => 8593,
		'uArr'     => 8657,
		'Ucirc'    => 219,
		'ucirc'    => 251,
		'Ugrave'   => 217,
		'ugrave'   => 249,
		'uml'      => 168,
		'upsih'    => 978,
		'Upsilon'  => 933,
		'upsilon'  => 965,
		'Uuml'     => 220,
		'uuml'     => 252,
		'weierp'   => 8472,
		'Xi'       => 926,
		'xi'       => 958,
		'Yacute'   => 221,
		'yacute'   => 253,
		'yen'      => 165,
		'Yuml'     => 376,
		'yuml'     => 255,
		'Zeta'     => 918,
		'zeta'     => 950,
		'zwj'      => 8205,
		'zwnj'     => 8204
	);
// Sanitizer class members: entity aliases, tag/attribute sanitizing, CSS checks, character references.

	/**
	 * Alternative (localized) entity names mapped to the canonical entity
	 * name they stand for.
	 */
	static $htmlEntityAliases = array(
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	);

	/**
	 * Cached attribute-matching regex; built on first use by getAttribsRegex().
	 */
	static $attribsRegex;

	/**
	 * Build (once) and return the regular expression used to pick
	 * name[=value] pairs out of a tag's attribute string.
	 *
	 * Capture groups: 1 = attribute name, 2 = the whole "=value" part,
	 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
	 * 6 = unquoted #hex colour value.
	 *
	 * @return string PCRE pattern (flags /sx)
	 */
	static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$attribFirst = '[:A-Z_a-z0-9]';
			$attrib = '[:A-Z_a-z-.0-9]';
			$space = '[\x09\x0a\x0d\x20]';
			self::$attribsRegex =
				"/(?:^|$space)({$attribFirst}{$attrib}*)
				  ($space*=$space*
					(?:
					 # The attribute value: quoted or alone
					  \"([^<\"]*)\"
					 | '([^<']*)'
					 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
					 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
					                     # colors are specified like this.
					                     # We'll be normalizing it.
					)
				)?(?=$space|\$)/sx";
		}
		return self::$attribsRegex;
	}

	/**
	 * Clean up HTML in $text: strip comments, keep only whitelisted
	 * elements (with sanitized attributes) and escape everything else.
	 * When $wgUseTidy is off, tag nesting is also tracked and unclosed
	 * tags are closed at the end.
	 *
	 * @param $text String
	 * @param $processCallback Callback invoked as ( &$params, $args ) on the
	 *   attribute string of each accepted opening tag before sanitizing
	 * @param $args Array passed through to $processCallback
	 * @param $extratags Array of additional tag names to allow
	 * @param $removetags Array of tag names to disallow
	 * @return string
	 */
	static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
		global $wgUseTidy;

		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

		wfProfileIn( __METHOD__ );

		if ( !$staticInitialised ) {

			$htmlpairsStatic = array( # Tags that must be closed
				'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
				'kbd', 'samp'
			);
			$htmlsingle = array(
				'br', 'hr', 'li', 'dt', 'dd'
			);
			$htmlsingleonly = array( # Elements that cannot have close tags
				'br', 'hr'
			);
			$htmlnest = array( # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
			);
			$tabletags = array( # Can only appear inside table, we will close them
				'td', 'th', 'tr',
			);
			$htmllist = array( # Tags used by list
				'ul','ol',
			);
			$listtags = array( # Tags that can appear in a list
				'li',
			);

			global $wgAllowImageTag;
			if ( $wgAllowImageTag ) {
				$htmlsingle[] = 'img';
				$htmlsingleonly[] = 'img';
			}

			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

			# Convert them all to hashtables for faster lookup
			$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
				'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
			foreach ( $vars as $var ) {
				$$var = array_flip( $$var );
			}
			$staticInitialised = true;
		}
		# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
		$extratags = array_flip( $extratags );
		$removetags = array_flip( $removetags );
		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );

		# Remove HTML comments
		$text = Sanitizer::removeHTMLcomments( $text );
		$bits = explode( '<', $text );
		# Text before the first "<": a stray ">" is escaped so it cannot end a tag
		$text = str_replace( '>', '&gt;', array_shift( $bits ) );
		if ( !$wgUseTidy ) {
			$tagstack = $tablestack = array();
			foreach ( $bits as $x ) {
				$regs = array();
				# $slash: Does the current element start with a '/'?
				# $t: Current element name
				# $params: String between element name and >
				# $brace: Ending '>' or '/>'
				# $rest: Everything until the next element of $bits
				if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
					list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				} else {
					$slash = $t = $params = $brace = $rest = null;
				}

				$badtag = false;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					# Check our stack
					if ( $slash && isset( $htmlsingleonly[$t] ) ) {
						$badtag = true;
					} elseif ( $slash ) {
						# Closing a tag... is it the one we just opened?
						$ot = @array_pop( $tagstack );
						if ( $ot != $t ) {
							if ( isset( $htmlsingleallowed[$ot] ) ) {
								# Pop all elements with an optional close tag
								# and see if we find a match below them
								$optstack = array();
								array_push( $optstack, $ot );
								wfSuppressWarnings();
								$ot = array_pop( $tagstack );
								wfRestoreWarnings();
								while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
									array_push( $optstack, $ot );
									wfSuppressWarnings();
									$ot = array_pop( $tagstack );
									wfRestoreWarnings();
								}
								if ( $t != $ot ) {
									# No match. Push the optional elements back again
									$badtag = true;
									wfSuppressWarnings();
									$ot = array_pop( $optstack );
									wfRestoreWarnings();
									while ( $ot ) {
										array_push( $tagstack, $ot );
										wfSuppressWarnings();
										$ot = array_pop( $optstack );
										wfRestoreWarnings();
									}
								}
							} else {
								@array_push( $tagstack, $ot );
								# <li> can be nested in <ul> or <ol>, skip those cases:
								if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
									$badtag = true;
								}
							}
						} else {
							if ( $t == 'table' ) {
								$tagstack = array_pop( $tablestack );
							}
						}
						$newparams = '';
					} else {
						# Keep track for later
						if ( isset( $tabletags[$t] ) &&
							!in_array( 'table', $tagstack ) ) {
							$badtag = true;
						} elseif ( in_array( $t, $tagstack ) &&
							!isset( $htmlnest [$t ] ) ) {
							$badtag = true;
						# Is it a self closed htmlpair ? (bug 5487)
						} elseif ( $brace == '/>' &&
							isset( $htmlpairs[$t] ) ) {
							$badtag = true;
						} elseif ( isset( $htmlsingleonly[$t] ) ) {
							# Hack to force empty tag for uncloseable elements
							$brace = '/>';
						} elseif ( isset( $htmlsingle[$t] ) ) {
							# Hack to not close $htmlsingle tags
							$brace = null;
						} elseif ( isset( $tabletags[$t] )
							&& in_array( $t, $tagstack ) ) {
							// New table tag but forgot to close the previous one
							$text .= "</$t>";
						} else {
							if ( $t == 'table' ) {
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}

						# Replace any variables or template parameters with
						# plaintext results.
						if( is_callable( $processCallback ) ) {
							call_user_func_array( $processCallback, array( &$params, $args ) );
						}

						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
						$text .= "<$slash$t$newparams$close>$rest";
						continue;
					}
				}
				# Not an accepted tag: emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
			# Close off any remaining tags
			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
				$text .= "</$t>\n";
				if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
			}
		} else {
			# this might be possible using tidy itself
			foreach ( $bits as $x ) {
				preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
					$x, $regs );
				@list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
				} else {
					$text .= '&lt;' . str_replace( '>', '&gt;', $x);
				}
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Remove '<!--' ... '-->' comments from $text. An unterminated comment
	 * stops processing and is left in place. When a comment (plus any
	 * surrounding spaces) sits between two newlines, the spaces and one of
	 * the newlines are removed together with it.
	 *
	 * @param $text String
	 * @return string
	 */
	static function removeHTMLcomments( $text ) {
		wfProfileIn( __METHOD__ );
		while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
			$end = strpos( $text, '-->', $start + 4 );
			if ( $end === false ) {
				# Unterminated comment; bail out
				break;
			}

			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline
			$spaceStart = max( $start - 1, 0 );
			$spaceLen = $end - $spaceStart;
			while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
				$spaceStart--;
				$spaceLen++;
			}
			while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
				$spaceLen++;
			}
			if ( substr( $text, $spaceStart, 1 ) === "\n" and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
			} else {
				# Remove just the comment.
				$text = substr_replace( $text, '', $start, $end - $start );
			}
		}
		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Validate decoded attributes against the whitelist for $element.
	 *
	 * @param $attribs Array of attribute name => value
	 * @param $element String element name
	 * @return Array of accepted attribute name => value
	 */
	static function validateTagAttributes( $attribs, $element ) {
		return Sanitizer::validateAttributes( $attribs,
			Sanitizer::attributeWhitelist( $element ) );
	}

	/**
	 * Filter an attribute array down to the names in $whitelist, applying
	 * per-attribute checks (CSS for style, id escaping, URI sanity checks,
	 * protocol check for href/src).
	 *
	 * @param $attribs Array of attribute name => value
	 * @param $whitelist Array list of allowed attribute names
	 * @return Array of accepted attribute name => value
	 */
	static function validateAttributes( $attribs, $whitelist ) {
		global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;

		$whitelist = array_flip( $whitelist );
		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

		$out = array();
		foreach( $attribs as $attribute => $value ) {
			#allow XML namespace declaration if RDFa is enabled
			if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					$out[$attribute] = $value;
				}

				continue;
			}

			# Allow any attribute beginning with "data-", if in HTML5 mode
			if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
				continue;
			}

			# Strip javascript "expression" from stylesheets.
			# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
			if( $attribute == 'style' ) {
				$value = Sanitizer::checkCss( $value );
			}

			if ( $attribute === 'id' ) {
				$value = Sanitizer::escapeId( $value, 'noninitial' );
			}

			//RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
			if ( $attribute === 'rel' || $attribute === 'rev' ||
				$attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
				$attribute === 'datatype' || $attribute === 'typeof' ||                              #RDFa
				$attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' ||  #HTML5 microdata
				$attribute === 'itemscope' || $attribute === 'itemtype' ) {                          #HTML5 microdata

				//Paranoia. Allow "simple" values but suppress javascript
				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					continue;
				}
			}

			# NOTE: even though elements using href/src are not allowed directly, supply
			#       validation code that can be used by tag hook handlers, etc
			if ( $attribute === 'href' || $attribute === 'src' ) {
				if ( !preg_match( $hrefExp, $value ) ) {
					continue; //drop any href or src attributes not using an allowed protocol.
						  //NOTE: this also drops all relative URLs
				}
			}

			// If this attribute was previously set, override it.
			// Output should only have one attribute of each name.
			$out[$attribute] = $value;
		}

		if ( $wgAllowMicrodataAttributes ) {
			# itemtype, itemid, itemref don't make sense without itemscope
			if ( !array_key_exists( 'itemscope', $out ) ) {
				unset( $out['itemtype'] );
				unset( $out['itemid'] );
				unset( $out['itemref'] );
			}
			# TODO: Strip itemprop if we aren't descendants of an itemscope.
		}
		return $out;
	}

	/**
	 * Merge two attribute arrays; values in $b override $a, except that
	 * differing string 'class' values are combined into one
	 * space-separated, de-duplicated list.
	 *
	 * @param $a Array
	 * @param $b Array
	 * @return array
	 */
	static function mergeAttributes( $a, $b ) {
		$out = array_merge( $a, $b );
		if( isset( $a['class'] ) && isset( $b['class'] )
			&& is_string( $a['class'] ) && is_string( $b['class'] )
			&& $a['class'] !== $b['class'] ) {
			$classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
				-1, PREG_SPLIT_NO_EMPTY );
			$out['class'] = implode( ' ', array_unique( $classes ) );
		}
		return $out;
	}

	/**
	 * Normalize a CSS value (decode character references and CSS escapes,
	 * fold look-alike characters, drop comments) and reject it if it
	 * contains control characters or one of the blacklisted constructs.
	 *
	 * @param $value String
	 * @return String the normalized CSS, or a CSS comment explaining the
	 *   rejection
	 */
	static function checkCss( $value ) {
		// Decode character references like &#123;
		$value = Sanitizer::decodeCharReferences( $value );

		// Decode escape sequences and line continuation
		// See the grammar in the CSS 2 spec, appendix D.
		// This has to be done AFTER decoding character references.
		// This means it isn't possible for this function to return
		// unsanitized escape sequences. It is possible to manufacture
		// input that contains character references that decode to
		// escape sequences that decode to character references, but
		// it's OK for the return value to contain character references
		// because the caller is supposed to escape those anyway.
		static $decodeRegex;
		if ( !$decodeRegex ) {
			$space = '[\\x20\\t\\r\\n\\f]';
			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
			$backslash = '\\\\';
			$decodeRegex = "/ $backslash
				(?:
					($nl) |  # 1. Line continuation
					([0-9A-Fa-f]{1,6})$space? |  # 2. character number
					(.) | # 3. backslash cancelling special meaning
					() | # 4. backslash at end of string
				)/xu";
		}
		$value = preg_replace_callback( $decodeRegex,
			array( __CLASS__, 'cssDecodeCallback' ), $value );

		// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
		// (range written with code-point escapes so it survives re-encoding of this file)
		$value = preg_replace_callback(
			'/[\x{FF01}-\x{FF5A}]/u', // U+FF01 to U+FF5A
			function ( $matches ) {
				$cp = utf8ToCodepoint( $matches[0] );
				if ( $cp === false ) {
					return '';
				}
				return chr( $cp - 65248 ); // ASCII range \x21-\x7A
			},
			$value
		);

		// Convert more characters IE6 might treat as ascii
		// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
		$value = str_replace(
			array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
			array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
			$value
		);

		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}

		// S followed by repeat, iteration, or prolonged sound marks,
		// which IE will treat as "ss"
		$value = preg_replace(
			'/s(?:
				\xE3\x80\xB1 | # U+3031
				\xE3\x82\x9D | # U+309D
				\xE3\x83\xBC | # U+30FC
				\xE3\x83\xBD | # U+30FD
				\xEF\xB9\xBC | # U+FE7C
				\xEF\xB9\xBD | # U+FE7D
				\xEF\xBD\xB0   # U+FF70
			)/ix',
			'ss',
			$value
		);

		// Reject problematic keywords and control characters
		if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
			return '/* invalid control char */';
		} elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
			return '/* insecure input */';
		}
		return $value;
	}

	/**
	 * preg_replace_callback handler for checkCss(): decode one CSS
	 * backslash escape. Newline, quotes and backslash are re-emitted as a
	 * hex escape; anything else is returned as the plain character.
	 *
	 * @param $matches Array
	 * @return String
	 */
	static function cssDecodeCallback( $matches ) {
		if ( $matches[1] !== '' ) {
			// Line continuation
			return '';
		} elseif ( $matches[2] !== '' ) {
			$char = codepointToUtf8( hexdec( $matches[2] ) );
		} elseif ( $matches[3] !== '' ) {
			$char = $matches[3];
		} else {
			$char = '\\';
		}
		if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
			// These characters need to be escaped in strings
			// Clean up the escape sequence to avoid parsing errors by clients
			return '\\' . dechex( ord( $char ) ) . ' ';
		} else {
			// Decode unnecessary escape
			return $char;
		}
	}

	/**
	 * Take a tag's raw attribute text, keep only the whitelisted
	 * attributes for $element and return them re-encoded as
	 * ' name="value" ...' (leading space included), or '' if none remain.
	 *
	 * @param $text String
	 * @param $element String
	 * @return String
	 */
	static function fixTagAttributes( $text, $element ) {
		if( trim( $text ) == '' ) {
			return '';
		}

		$decoded = Sanitizer::decodeTagAttributes( $text );
		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );

		$attribs = array();
		foreach( $stripped as $attribute => $value ) {
			$encAttribute = htmlspecialchars( $attribute );
			$encValue = Sanitizer::safeEncodeAttribute( $value );

			$attribs[] = "$encAttribute=\"$encValue\"";
		}
		return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
	}

	/**
	 * Encode an attribute value for HTML output.
	 *
	 * @param $text String
	 * @return String HTML-encoded text fragment
	 */
	static function encodeAttribute( $text ) {
		$encValue = htmlspecialchars( $text, ENT_QUOTES );

		// Whitespace is normalized during attribute decoding,
		// so if we've been passed non-spaces we must encode them
		// ahead of time or they won't be preserved.
		$encValue = strtr( $encValue, array(
			"\n" => '&#10;',
			"\r" => '&#13;',
			"\t" => '&#9;',
		) );

		return $encValue;
	}

	/**
	 * Encode an attribute value for HTML tags, additionally armoring
	 * character sequences that later wikitext parsing could expand.
	 *
	 * @param $text String
	 * @return String HTML-encoded text fragment
	 */
	static function safeEncodeAttribute( $text ) {
		$encValue = Sanitizer::encodeAttribute( $text );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$encValue = strtr( $encValue, array(
			'<'    => '&lt;',   // This should never happen,
			'>'    => '&gt;',   // we've received invalid input
			'"'    => '&quot;', // which should have been escaped.
			'{'    => '&#123;',
			'['    => '&#91;',
			"''"   => '&#39;&#39;',
			'ISBN' => '&#x49;SBN',
			'RFC'  => '&#x52;FC',
			'PMID' => '&#x50;MID',
			'|'    => '&#124;',
			'__'   => '&#95;_',
		) );

		# Stupid hack
		$encValue = preg_replace_callback(
			'/((?i)' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$encValue );
		return $encValue;
	}

	/**
	 * Turn an arbitrary string into a value usable as an id attribute.
	 *
	 * @param $id String
	 * @param $options String|Array flags: 'noninitial' (do not force a
	 *   leading letter), 'legacy' (always use the HTML4-style escaping)
	 * @return String
	 */
	static function escapeId( $id, $options = array() ) {
		global $wgHtml5, $wgExperimentalHtmlIds;
		$options = (array)$options;

		if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
			$id = Sanitizer::decodeCharReferences( $id );
			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
			$id = trim( $id, '_' );
			if ( $id === '' ) {
				# Must have been all whitespace to start with.
				return '_';
			} else {
				return $id;
			}
		}

		# HTML4-style escaping
		static $replace = array(
			'%3A' => ':',
			'%' => '.'
		);

		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

		if ( !preg_match( '/^[a-zA-Z]/', $id )
			&& !in_array( 'noninitial', $options ) ) {
			// Initial character must be a letter!
			$id = "x$id";
		}
		return $id;
	}

	/**
	 * Turn an arbitrary string into a value usable as a class name:
	 * disallowed characters (and a leading digit or hyphen) become
	 * underscores, runs of underscores collapse, trailing ones are removed.
	 *
	 * @param $class String
	 * @return String
	 */
	static function escapeClass( $class ) {
		// Convert ugly stuff to underscores and kill underscores in ugly places
		return rtrim(preg_replace(
			array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
			'_',
			$class ), '_');
	}

	/**
	 * HTML-escape $html after decoding its character references, so
	 * references in the input come out as the escaped character rather
	 * than a double-escaped reference.
	 *
	 * @param $html String
	 * @return String escaped text
	 */
	static function escapeHtmlAllowEntities( $html ) {
		$html = Sanitizer::decodeCharReferences( $html );
		# It seems wise to escape ' as well as ", as a matter of course.  Can't
		# hurt.
		$html = htmlspecialchars( $html, ENT_QUOTES );
		return $html;
	}

	/**
	 * Regex replace callback for safeEncodeAttribute(): encode the colon
	 * of a matched URL protocol.
	 *
	 * @param $matches Array
	 * @return String
	 */
	private static function armorLinksCallback( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	}

	/**
	 * Parse a tag's attribute text into an array of lower-cased attribute
	 * name => decoded, whitespace-normalized value.
	 *
	 * @param $text String
	 * @return Array
	 */
	public static function decodeTagAttributes( $text ) {
		if( trim( $text ) == '' ) {
			return array();
		}

		$attribs = array();
		$pairs = array();
		if( !preg_match_all(
			self::getAttribsRegex(),
			$text,
			$pairs,
			PREG_SET_ORDER ) ) {
			return $attribs;
		}

		foreach( $pairs as $set ) {
			$attribute = strtolower( $set[1] );
			$value = Sanitizer::getTagAttributeCallback( $set );

			// Normalize whitespace
			$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
			$value = trim( $value );

			// Decode character references
			$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
		}
		return $attribs;
	}

	/**
	 * Pick the attribute value out of one match set produced by
	 * getAttribsRegex().
	 *
	 * @param $set Array
	 * @throws MWException when the set matches none of the expected shapes
	 * @return String
	 */
	private static function getTagAttributeCallback( $set ) {
		if( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			return $set[6];
		} elseif( isset( $set[5] ) ) {
			# No quotes.
			return $set[5];
		} elseif( isset( $set[4] ) ) {
			# Single-quoted
			return $set[4];
		} elseif( isset( $set[3] ) ) {
			# Double-quoted
			return $set[3];
		} elseif( !isset( $set[2] ) ) {
			# In XHTML, attributes must have a value.
			# For 'reduced' form, return explicitly the attribute name here.
			return $set[1];
		} else {
			throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
		}
	}

	/**
	 * Normalize whitespace and character references in an attribute value
	 * and encode double quotes.
	 *
	 * @param $text String
	 * @return String
	 */
	private static function normalizeAttributeValue( $text ) {
		return str_replace( '"', '&quot;',
			self::normalizeWhitespace(
				Sanitizer::normalizeCharReferences( $text ) ) );
	}

	/**
	 * Replace each CRLF pair, space, CR, LF or tab with a single space.
	 *
	 * @param $text String
	 * @return String
	 */
	private static function normalizeWhitespace( $text ) {
		return preg_replace(
			'/\r\n|[\x20\x0d\x0a\x09]/',
			' ',
			$text );
	}

	/**
	 * Collapse runs of spaces/underscores in a section name into one
	 * space and trim the result.
	 *
	 * @param $section String
	 * @return String
	 */
	static function normalizeSectionNameWhitespace( $section ) {
		return trim( preg_replace( '/[ _]+/', ' ', $section ) );
	}

	/**
	 * Rewrite every character reference in $text into a normalized form;
	 * invalid references and bare ampersands are HTML-escaped.
	 *
	 * @param $text String
	 * @return String
	 */
	static function normalizeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
			$text );
	}

	/**
	 * preg_replace_callback handler for normalizeCharReferences().
	 *
	 * @param $matches Array
	 * @return String
	 */
	static function normalizeCharReferencesCallback( $matches ) {
		$ret = null;
		if( $matches[1] != '' ) {
			$ret = Sanitizer::normalizeEntity( $matches[1] );
		} elseif( $matches[2] != '' ) {
			$ret = Sanitizer::decCharReference( $matches[2] );
		} elseif( $matches[3] != '' ) {
			$ret = Sanitizer::hexCharReference( $matches[3] );
		}
		if( is_null( $ret ) ) {
			return htmlspecialchars( $matches[0] );
		} else {
			return $ret;
		}
	}

	/**
	 * Normalize a named entity: aliases map to their canonical entity,
	 * lt/gt/amp/quot are kept as named entities, other known entities
	 * become numeric references, and unknown names get their ampersand
	 * escaped.
	 *
	 * @param $name String entity name without "&" and ";"
	 * @return String
	 */
	static function normalizeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			return '&' . self::$htmlEntityAliases[$name] . ';';
		} elseif ( in_array( $name,
			array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
			return "&$name;";
		} elseif ( isset( self::$htmlEntities[$name] ) ) {
			return '&#' . self::$htmlEntities[$name] . ';';
		} else {
			# Unknown entity name: not a real reference, so escape the "&"
			return "&amp;$name;";
		}
	}

	/**
	 * @param $codepoint String decimal digits
	 * @return String|null canonical decimal reference, or null if the
	 *   code point is not valid
	 */
	static function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		if( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#%d;', $point );
		} else {
			return null;
		}
	}

	/**
	 * @param $codepoint String hexadecimal digits
	 * @return String|null canonical lower-case hex reference, or null if
	 *   the code point is not valid
	 */
	static function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		if( Sanitizer::validateCodepoint( $point ) ) {
			return sprintf( '&#x%x;', $point );
		} else {
			return null;
		}
	}

	/**
	 * Whether $codepoint is tab, LF, CR, or lies in one of the ranges
	 * 0x20-0xD7FF, 0xE000-0xFFFD, 0x10000-0x10FFFF.
	 *
	 * @param $codepoint Integer
	 * @return Boolean
	 */
	private static function validateCodepoint( $codepoint ) {
		return ($codepoint ==    0x09)
			|| ($codepoint ==    0x0a)
			|| ($codepoint ==    0x0d)
			|| ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
			|| ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
			|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
	}

	/**
	 * Decode the character references in $text into UTF-8 characters.
	 *
	 * @param $text String
	 * @return String
	 */
	public static function decodeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text );
	}

	/**
	 * Like decodeCharReferences(), but passes the result through
	 * $wgContLang->normalize() when at least one replacement was made.
	 *
	 * @param $text String
	 * @return String
	 */
	public static function decodeCharReferencesAndNormalize( $text ) {
		global $wgContLang;
		$text = preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text, /* limit */ -1, $count );

		if ( $count ) {
			return $wgContLang->normalize( $text );
		} else {
			return $text;
		}
	}

	/**
	 * preg_replace_callback handler for the decodeCharReferences*() methods.
	 *
	 * @param $matches Array
	 * @return String
	 */
	static function decodeCharReferencesCallback( $matches ) {
		if( $matches[1] != '' ) {
			return Sanitizer::decodeEntity( $matches[1] );
		} elseif( $matches[2] != '' ) {
			return  Sanitizer::decodeChar( intval( $matches[2] ) );
		} elseif( $matches[3] != ''  ) {
			return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	/**
	 * Return the UTF-8 encoding of $codepoint, or UTF8_REPLACEMENT when
	 * the code point is not valid.
	 *
	 * @param $codepoint Integer
	 * @return String
	 */
	static function decodeChar( $codepoint ) {
		if( Sanitizer::validateCodepoint( $codepoint ) ) {
			return codepointToUtf8( $codepoint );
		} else {
			return UTF8_REPLACEMENT;
		}
	}

	/**
	 * Decode a named entity (or alias) to its UTF-8 character; unknown
	 * names are returned unchanged as "&name;".
	 *
	 * @param $name String
	 * @return String
	 */
	static function decodeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			$name = self::$htmlEntityAliases[$name];
		}
		if( isset( self::$htmlEntities[$name] ) ) {
			return codepointToUtf8( self::$htmlEntities[$name] );
		} else {
			return "&$name;";
		}
	}

	/**
	 * Return the list of allowed attribute names for $element (empty
	 * array for unknown elements). The table is built once per request.
	 *
	 * @param $element String
	 * @return Array
	 */
	static function attributeWhitelist( $element ) {
		static $list;
		if( !isset( $list ) ) {
			$list = Sanitizer::setupAttributeWhitelist();
		}
		return isset( $list[$element] )
			? $list[$element]
			: array();
	}

	/**
	 * Build the element => allowed-attribute-names table.
	 * NOTE(review): the remainder of this method lies beyond the reviewed
	 * section; only the visible part is reproduced here.
	 */
	static function setupAttributeWhitelist() {
		global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;

		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );

		if ( $wgAllowRdfaAttributes ) {
			#RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
			$common = array_merge( $common, array(
				'about', 'property', 'resource', 'datatype', 'typeof',
			) );
		}

		if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
			# add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
			$common = array_merge( $common, array(
				'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
			) );
		}

		$block = array_merge( $common, array( 'align' ) );
		$tablealign = array( 'align', 'char', 'charoff', 'valign' );
		$tablecell = array( 'abbr',
							'axis',
							'headers',
							'scope',
							'rowspan',
							'colspan',
							'nowrap', # deprecated
							'width',  # deprecated
							'height', # deprecated
							'bgcolor' # deprecated
							);

		# Numbers refer to sections in HTML 4.01 standard describing the element.
		# See: http://www.w3.org/TR/html4/
		$whitelist = array (
			# 7.5.4
			'div'        => $block,
			'center'     => $common, # deprecated
			'span'       => $block, # ??

			# 7.5.5
			'h1'         => $block,
			'h2'         => $block,
			'h3'         => $block,
			'h4'         => $block,
			'h5'         => $block,
			'h6'         => $block,

			# 7.5.6
			# address

			# 8.2.4
			# bdo

			# 9.2.1
			'em'         => $common,
			'strong'     => $common,
			'cite'       => $common,
			'dfn'        => $common,
			'code'       => $common,
			'samp'       => $common,
			'kbd'        => $common,
			'var'        => $common,
			'abbr'       => $common,
			# acronym

			# 9.2.2
			'blockquote' => array_merge( $common, array( 'cite' ) ),
			# q

			# 9.2.3
			'sub'        => $common,
			'sup'        => $common,

			# 9.3.1
			'p'          => $block,

			# 9.3.2
			'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

			# 9.3.4
			'pre'        => array_merge( $common, array( 'width' ) ),

			# 9.4
			'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
			'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

			# 10.2
			'ul'         => array_merge( $common, array( 'type' ) ),
			'ol'         => array_merge( $common, array( 'type', 'start' ) ),
			'li'         => array_merge( $common, array( 'type', 'value' ) ),

			# 10.3
			'dl'         => $common,
			'dd'         => $common,
			'dt'         => $common,

			# 11.2.1
			'table'      => array_merge( $common,
								array( 'summary', 'width', 'border', 'frame',
										'rules', 'cellspacing', 'cellpadding',
										'align', 'bgcolor',
								) ),

			# 11.2.2
			'caption'    => array_merge( $common, array( 'align' ) ),

			# 11.2.3
			'thead'      => array_merge( $common, $tablealign ),
			'tfoot'      => array_merge( $common, $tablealign ),
			'tbody'      => array_merge( $common, $tablealign ),

			# 11.2.4
			'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
			'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

			# 11.2.5
			'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

			# 11.2.6
			'td'         => array_merge( $common, $tablecell, $tablealign ),
			'th'         => array_merge( $common, $tablecell, $tablealign ),

			# 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
			'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

			# 13.2
			# Not usually allowed, but may be used for extension-style hooks
			# such as <math> when it is rasterized, or if $wgAllowImageTag is
			# true
			'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

			# 15.2.1
			'tt'         => $common,
			'b'          => $common,
			'i'          => $common,
			'big'        => $common,
			'small'      => $common,
			'strike'     => $common,
			's'          => $common,
			'u'          => $common,

			# 15.2.2
			'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
			# basefont

			# 15.3
			'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

			# XHTML Ruby annotation text module, simple ruby only.
01542 # http://www.w3c.org/TR/ruby/ 01543 'ruby' => $common, 01544 # rbc 01545 # rtc 01546 'rb' => $common, 01547 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ), 01548 'rp' => $common, 01549 01550 # MathML root element, where used for extensions 01551 # 'title' may not be 100% valid here; it's XHTML 01552 # http://www.w3.org/TR/REC-MathML/ 01553 'math' => array( 'class', 'style', 'id', 'title' ), 01554 01555 # HTML 5 section 4.6 01556 'bdi' => $common, 01557 01558 ); 01559 return $whitelist; 01560 } 01561 01572 static function stripAllTags( $text ) { 01573 # Actual <tags> 01574 $text = StringUtils::delimiterReplace( '<', '>', '', $text ); 01575 01576 # Normalize &entities and whitespace 01577 $text = self::decodeCharReferences( $text ); 01578 $text = self::normalizeWhitespace( $text ); 01579 01580 return $text; 01581 } 01582 01592 static function hackDocType() { 01593 $out = "<!DOCTYPE html [\n"; 01594 foreach( self::$htmlEntities as $entity => $codepoint ) { 01595 $out .= "<!ENTITY $entity \"&#$codepoint;\">"; 01596 } 01597 $out .= "]>\n"; 01598 return $out; 01599 } 01600 01605 static function cleanUrl( $url ) { 01606 # Normalize any HTML entities in input. They will be 01607 # re-escaped by makeExternalLink(). 01608 $url = Sanitizer::decodeCharReferences( $url ); 01609 01610 # Escape any control characters introduced by the above step 01611 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 01612 array( __CLASS__, 'cleanUrlCallback' ), $url ); 01613 01614 # Validate hostname portion 01615 $matches = array(); 01616 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { 01617 list( /* $whole */, $protocol, $host, $rest ) = $matches; 01618 01619 // Characters that will be ignored in IDNs. 01620 // http://tools.ietf.org/html/3454#section-3.1 01621 // Strip them before further processing so blacklists and such work. 
01622 $strip = "/ 01623 \\s| # general whitespace 01624 \xc2\xad| # 00ad SOFT HYPHEN 01625 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 01626 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 01627 \xe2\x81\xa0| # 2060 WORD JOINER 01628 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 01629 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 01630 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 01631 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 01632 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 01633 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 01634 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 01635 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 01636 /xuD"; 01637 01638 $host = preg_replace( $strip, '', $host ); 01639 01640 // @todo FIXME: Validate hostnames here 01641 01642 return $protocol . $host . $rest; 01643 } else { 01644 return $url; 01645 } 01646 } 01647 01652 static function cleanUrlCallback( $matches ) { 01653 return urlencode( $matches[0] ); 01654 } 01655 01684 public static function validateEmail( $addr ) { 01685 $result = null; 01686 if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) { 01687 return $result; 01688 } 01689 01690 // Please note strings below are enclosed in brackets [], this make the 01691 // hyphen "-" a range indicator. Hence it is double backslashed below. 01692 // See bug 26948 01693 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ; 01694 $rfc1034_ldh_str = "a-z0-9\\-" ; 01695 01696 $HTML5_email_regexp = "/ 01697 ^ # start of string 01698 [$rfc5322_atext\\.]+ # user part which is liberal :p 01699 @ # 'apostrophe' 01700 [$rfc1034_ldh_str]+ # First domain part 01701 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot 01702 $ # End of string 01703 /ix" ; // case Insensitive, eXtended 01704 01705 return (bool) preg_match( $HTML5_email_regexp, $addr ); 01706 } 01707 }