MediaWiki  REL1_20
Sanitizer.php
Go to the documentation of this file.
00001 <?php
00031 class Sanitizer {
/**
 * Regular expression (PCRE extended mode, /x) matching character references
 * and stray ampersands. Exactly one capture group is filled per branch:
 *   1 - the name of a named reference such as &nbsp; (ASCII alphanumerics
 *       and/or bytes 0x80-0xff)
 *   2 - the digits of a decimal numeric reference such as &#160;
 *   3 - the digits of a hexadecimal numeric reference such as &#xA0; or &#XA0;
 *   4 - a bare '&' that does not start any of the forms above
 */
00036         const CHAR_REFS_REGEX =
00037                 '/&([A-Za-z0-9\x80-\xff]+);
00038                  |&\#([0-9]+);
00039                  |&\#[xX]([0-9A-Fa-f]+);
00040                  |(&)/x';
00041 
/**
 * EVIL_URI_PATTERN: case-insensitive pattern that detects the word
 * "javascript" or "vbscript" when it appears at the start of the value, after
 * whitespace, or after a comment terminator (asterisk followed by slash, plus
 * optional whitespace), and is followed by a non-word character or the end of
 * the value. validateAttributes() drops attribute values that match it.
 *
 * XMLNS_ATTRIBUTE_PATTERN: matches attribute names of the form "xmlns:" followed
 * by one or more of colon, ASCII letters, underscore, hyphen, dot or digits.
 * The match is case-sensitive, so the "xmlns" prefix must be lowercase.
 * validateAttributes() uses it to let namespace declarations through when
 * $wgAllowRdfaAttributes is enabled.
 */
00050         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
00051         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052 
/**
 * Table of HTML named character entities.
 * Keys are the case-sensitive entity names (without the leading '&' and the
 * trailing ';'); values are the corresponding Unicode code points in decimal,
 * e.g. 'amp' => 38 and 'nbsp' => 160.
 */
00059         static $htmlEntities = array(
00060                 'Aacute'   => 193,
00061                 'aacute'   => 225,
00062                 'Acirc'    => 194,
00063                 'acirc'    => 226,
00064                 'acute'    => 180,
00065                 'AElig'    => 198,
00066                 'aelig'    => 230,
00067                 'Agrave'   => 192,
00068                 'agrave'   => 224,
00069                 'alefsym'  => 8501,
00070                 'Alpha'    => 913,
00071                 'alpha'    => 945,
00072                 'amp'      => 38,
00073                 'and'      => 8743,
00074                 'ang'      => 8736,
00075                 'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00076                 'Aring'    => 197,
00077                 'aring'    => 229,
00078                 'asymp'    => 8776,
00079                 'Atilde'   => 195,
00080                 'atilde'   => 227,
00081                 'Auml'     => 196,
00082                 'auml'     => 228,
00083                 'bdquo'    => 8222,
00084                 'Beta'     => 914,
00085                 'beta'     => 946,
00086                 'brvbar'   => 166,
00087                 'bull'     => 8226,
00088                 'cap'      => 8745,
00089                 'Ccedil'   => 199,
00090                 'ccedil'   => 231,
00091                 'cedil'    => 184,
00092                 'cent'     => 162,
00093                 'Chi'      => 935,
00094                 'chi'      => 967,
00095                 'circ'     => 710,
00096                 'clubs'    => 9827,
00097                 'cong'     => 8773,
00098                 'copy'     => 169,
00099                 'crarr'    => 8629,
00100                 'cup'      => 8746,
00101                 'curren'   => 164,
00102                 'dagger'   => 8224,
00103                 'Dagger'   => 8225,
00104                 'darr'     => 8595,
00105                 'dArr'     => 8659,
00106                 'deg'      => 176,
00107                 'Delta'    => 916,
00108                 'delta'    => 948,
00109                 'diams'    => 9830,
00110                 'divide'   => 247,
00111                 'Eacute'   => 201,
00112                 'eacute'   => 233,
00113                 'Ecirc'    => 202,
00114                 'ecirc'    => 234,
00115                 'Egrave'   => 200,
00116                 'egrave'   => 232,
00117                 'empty'    => 8709,
00118                 'emsp'     => 8195,
00119                 'ensp'     => 8194,
00120                 'Epsilon'  => 917,
00121                 'epsilon'  => 949,
00122                 'equiv'    => 8801,
00123                 'Eta'      => 919,
00124                 'eta'      => 951,
00125                 'ETH'      => 208,
00126                 'eth'      => 240,
00127                 'Euml'     => 203,
00128                 'euml'     => 235,
00129                 'euro'     => 8364,
00130                 'exist'    => 8707,
00131                 'fnof'     => 402,
00132                 'forall'   => 8704,
00133                 'frac12'   => 189,
00134                 'frac14'   => 188,
00135                 'frac34'   => 190,
00136                 'frasl'    => 8260,
00137                 'Gamma'    => 915,
00138                 'gamma'    => 947,
00139                 'ge'       => 8805,
00140                 'gt'       => 62,
00141                 'harr'     => 8596,
00142                 'hArr'     => 8660,
00143                 'hearts'   => 9829,
00144                 'hellip'   => 8230,
00145                 'Iacute'   => 205,
00146                 'iacute'   => 237,
00147                 'Icirc'    => 206,
00148                 'icirc'    => 238,
00149                 'iexcl'    => 161,
00150                 'Igrave'   => 204,
00151                 'igrave'   => 236,
00152                 'image'    => 8465,
00153                 'infin'    => 8734,
00154                 'int'      => 8747,
00155                 'Iota'     => 921,
00156                 'iota'     => 953,
00157                 'iquest'   => 191,
00158                 'isin'     => 8712,
00159                 'Iuml'     => 207,
00160                 'iuml'     => 239,
00161                 'Kappa'    => 922,
00162                 'kappa'    => 954,
00163                 'Lambda'   => 923,
00164                 'lambda'   => 955,
00165                 'lang'     => 9001,
00166                 'laquo'    => 171,
00167                 'larr'     => 8592,
00168                 'lArr'     => 8656,
00169                 'lceil'    => 8968,
00170                 'ldquo'    => 8220,
00171                 'le'       => 8804,
00172                 'lfloor'   => 8970,
00173                 'lowast'   => 8727,
00174                 'loz'      => 9674,
00175                 'lrm'      => 8206,
00176                 'lsaquo'   => 8249,
00177                 'lsquo'    => 8216,
00178                 'lt'       => 60,
00179                 'macr'     => 175,
00180                 'mdash'    => 8212,
00181                 'micro'    => 181,
00182                 'middot'   => 183,
00183                 'minus'    => 8722,
00184                 'Mu'       => 924,
00185                 'mu'       => 956,
00186                 'nabla'    => 8711,
00187                 'nbsp'     => 160,
00188                 'ndash'    => 8211,
00189                 'ne'       => 8800,
00190                 'ni'       => 8715,
00191                 'not'      => 172,
00192                 'notin'    => 8713,
00193                 'nsub'     => 8836,
00194                 'Ntilde'   => 209,
00195                 'ntilde'   => 241,
00196                 'Nu'       => 925,
00197                 'nu'       => 957,
00198                 'Oacute'   => 211,
00199                 'oacute'   => 243,
00200                 'Ocirc'    => 212,
00201                 'ocirc'    => 244,
00202                 'OElig'    => 338,
00203                 'oelig'    => 339,
00204                 'Ograve'   => 210,
00205                 'ograve'   => 242,
00206                 'oline'    => 8254,
00207                 'Omega'    => 937,
00208                 'omega'    => 969,
00209                 'Omicron'  => 927,
00210                 'omicron'  => 959,
00211                 'oplus'    => 8853,
00212                 'or'       => 8744,
00213                 'ordf'     => 170,
00214                 'ordm'     => 186,
00215                 'Oslash'   => 216,
00216                 'oslash'   => 248,
00217                 'Otilde'   => 213,
00218                 'otilde'   => 245,
00219                 'otimes'   => 8855,
00220                 'Ouml'     => 214,
00221                 'ouml'     => 246,
00222                 'para'     => 182,
00223                 'part'     => 8706,
00224                 'permil'   => 8240,
00225                 'perp'     => 8869,
00226                 'Phi'      => 934,
00227                 'phi'      => 966,
00228                 'Pi'       => 928,
00229                 'pi'       => 960,
00230                 'piv'      => 982,
00231                 'plusmn'   => 177,
00232                 'pound'    => 163,
00233                 'prime'    => 8242,
00234                 'Prime'    => 8243,
00235                 'prod'     => 8719,
00236                 'prop'     => 8733,
00237                 'Psi'      => 936,
00238                 'psi'      => 968,
00239                 'quot'     => 34,
00240                 'radic'    => 8730,
00241                 'rang'     => 9002,
00242                 'raquo'    => 187,
00243                 'rarr'     => 8594,
00244                 'rArr'     => 8658,
00245                 'rceil'    => 8969,
00246                 'rdquo'    => 8221,
00247                 'real'     => 8476,
00248                 'reg'      => 174,
00249                 'rfloor'   => 8971,
00250                 'Rho'      => 929,
00251                 'rho'      => 961,
00252                 'rlm'      => 8207,
00253                 'rsaquo'   => 8250,
00254                 'rsquo'    => 8217,
00255                 'sbquo'    => 8218,
00256                 'Scaron'   => 352,
00257                 'scaron'   => 353,
00258                 'sdot'     => 8901,
00259                 'sect'     => 167,
00260                 'shy'      => 173,
00261                 'Sigma'    => 931,
00262                 'sigma'    => 963,
00263                 'sigmaf'   => 962,
00264                 'sim'      => 8764,
00265                 'spades'   => 9824,
00266                 'sub'      => 8834,
00267                 'sube'     => 8838,
00268                 'sum'      => 8721,
00269                 'sup'      => 8835,
00270                 'sup1'     => 185,
00271                 'sup2'     => 178,
00272                 'sup3'     => 179,
00273                 'supe'     => 8839,
00274                 'szlig'    => 223,
00275                 'Tau'      => 932,
00276                 'tau'      => 964,
00277                 'there4'   => 8756,
00278                 'Theta'    => 920,
00279                 'theta'    => 952,
00280                 'thetasym' => 977,
00281                 'thinsp'   => 8201,
00282                 'THORN'    => 222,
00283                 'thorn'    => 254,
00284                 'tilde'    => 732,
00285                 'times'    => 215,
00286                 'trade'    => 8482,
00287                 'Uacute'   => 218,
00288                 'uacute'   => 250,
00289                 'uarr'     => 8593,
00290                 'uArr'     => 8657,
00291                 'Ucirc'    => 219,
00292                 'ucirc'    => 251,
00293                 'Ugrave'   => 217,
00294                 'ugrave'   => 249,
00295                 'uml'      => 168,
00296                 'upsih'    => 978,
00297                 'Upsilon'  => 933,
00298                 'upsilon'  => 965,
00299                 'Uuml'     => 220,
00300                 'uuml'     => 252,
00301                 'weierp'   => 8472,
00302                 'Xi'       => 926,
00303                 'xi'       => 958,
00304                 'Yacute'   => 221,
00305                 'yacute'   => 253,
00306                 'yen'      => 165,
00307                 'Yuml'     => 376,
00308                 'yuml'     => 255,
00309                 'Zeta'     => 918,
00310                 'zeta'     => 950,
00311                 'zwj'      => 8205,
00312                 'zwnj'     => 8204
00313         );
00314 
/**
 * Alternative spellings of entity names, mapped to the canonical name used as
 * a key in $htmlEntities. Both entries here are non-Latin-script spellings
 * that resolve to 'rlm'.
 * NOTE(review): the code that consults this table is outside this chunk;
 * presumably it is used when decoding named entities - confirm.
 */
00318         static $htmlEntityAliases = array(
00319                 'רלמ' => 'rlm',
00320                 'رلم' => 'rlm',
00321         );
00322 
/**
 * Cache for the pattern built by getAttribsRegex(); stays null until that
 * method is first called.
 */
00326         static $attribsRegex;
00327 
/**
 * Return the regular expression used to pick attribute name/value pairs out of
 * the attribute portion of a tag. The pattern is built on the first call and
 * cached in self::$attribsRegex.
 *
 * An attribute name starts with a colon, ASCII letter, underscore or digit and
 * may continue with those characters plus hyphen and dot. Whitespace means
 * tab, LF, CR or space. Each match must be followed by whitespace or the end
 * of the subject.
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the optional "= value" part as a whole
 *   3 - value given in double quotes
 *   4 - value given in single quotes
 *   5 - unquoted value
 *   6 - unquoted value starting with '#' followed by hex digits (colour)
 *
 * @return string PCRE pattern using the /s and /x modifiers
 */
00333         static function getAttribsRegex() {
00334                 if ( self::$attribsRegex === null ) {
00335                         $attribFirst = '[:A-Z_a-z0-9]';
00336                         $attrib = '[:A-Z_a-z-.0-9]';
00337                         $space = '[\x09\x0a\x0d\x20]';
00338                         self::$attribsRegex =
00339                                 "/(?:^|$space)({$attribFirst}{$attrib}*)
00340                                   ($space*=$space*
00341                                         (?:
00342                                          # The attribute value: quoted or alone
00343                                           \"([^<\"]*)\"
00344                                          | '([^<']*)'
00345                                          |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00346                                          |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00347                                                                                  # colors are specified like this.
00348                                                                                  # We'll be normalizing it.
00349                                         )
00350                                 )?(?=$space|\$)/sx";
00351                 }
00352                 return self::$attribsRegex;
00353         }
00354 
/**
 * Cleans up HTML: strips HTML comments, keeps only whitelisted tags (with
 * their attributes passed through fixTagAttributes()), and turns everything
 * else that looks like markup into escaped text ('<' becomes '&lt;' and
 * '>' becomes '&gt;').
 *
 * With $wgUseTidy disabled, a tag stack is also maintained so that mismatched
 * closing tags, table cells/rows outside a table, disallowed self-nesting and
 * self-closed paired tags are escaped instead of emitted, and any tags still
 * open at the end of the input are closed. With $wgUseTidy enabled, only the
 * whitelist check and attribute cleanup are done (presumably nesting is left
 * for Tidy to repair - confirm).
 *
 * The built-in tag lists are computed on the first call and kept in static
 * locals, so $wgAllowImageTag is only consulted on that first call.
 *
 * @param $text String: input to sanitize
 * @param $processCallback Callback: if callable, invoked as
 *   callback( &$params, $args ) so it can rewrite a tag's raw attribute
 *   string before the attributes are sanitized. Without Tidy it runs for
 *   accepted opening tags only; with Tidy it runs for every accepted tag.
 * @param $args Array: second argument handed to $processCallback
 * @param $extratags Array: extra tag names to allow; they are treated as
 *   paired tags (their self-closed form is rejected when Tidy is off)
 * @param $removetags Array: tag names to disallow, whether built-in or extra
 * @return string
 */
00366         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
00367                 global $wgUseTidy;
00368 
00369                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00370                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
00371 
00372                 wfProfileIn( __METHOD__ );
00373 
                      # One-time setup of the built-in tag lists; they live in static locals
                      # and are reused by every later call.
00374                 if ( !$staticInitialised ) {
00375 
00376                         $htmlpairsStatic = array( # Tags that must be closed
00377                                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00378                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00379                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
00380                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00381                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
00382                                 'kbd', 'samp'
00383                         );
00384                         $htmlsingle = array(
00385                                 'br', 'hr', 'li', 'dt', 'dd'
00386                         );
00387                         $htmlsingleonly = array( # Elements that cannot have close tags
00388                                 'br', 'hr'
00389                         );
00390                         $htmlnest = array( # Tags that can be nested--??
00391                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00392                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
00393                         );
00394                         $tabletags = array( # Can only appear inside table, we will close them
00395                                 'td', 'th', 'tr',
00396                         );
00397                         $htmllist = array( # Tags used by list
00398                                 'ul','ol',
00399                         );
00400                         $listtags = array( # Tags that can appear in a list
00401                                 'li',
00402                         );
00403 
00404                         global $wgAllowImageTag;
00405                         if ( $wgAllowImageTag ) {
00406                                 $htmlsingle[] = 'img';
00407                                 $htmlsingleonly[] = 'img';
00408                         }
00409 
00410                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
00411                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
00412 
00413                         # Convert them all to hashtables for faster lookup
00414                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
00415                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
00416                         foreach ( $vars as $var ) {
00417                                 $$var = array_flip( $$var );
00418                         }
00419                         $staticInitialised = true;
00420                 }
00421                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
00422                 $extratags = array_flip( $extratags );
00423                 $removetags = array_flip( $removetags );
00424                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
00425                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
00426 
00427                 # Remove HTML comments
00428                 $text = Sanitizer::removeHTMLcomments( $text );
                      # Split on '<': the first piece is the text before any tag, and every
                      # later piece begins right after a '<'.
00429                 $bits = explode( '<', $text );
00430                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
00431                 if ( !$wgUseTidy ) {
00432                         $tagstack = $tablestack = array();
00433                         foreach ( $bits as $x ) {
00434                                 $regs = array();
00435                                 # $slash: Does the current element start with a '/'?
00436                                 # $t: Current element name
00437                                 # $params: String between element name and >
00438                                 # $brace: Ending '>' or '/>'
00439                                 # $rest: Everything until the next element of $bits
00440                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00441                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00442                                 } else {
00443                                         $slash = $t = $params = $brace = $rest = null;
00444                                 }
00445 
00446                                 $badtag = false;
00447                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00448                                         # Check our stack
00449                                         if ( $slash && isset( $htmlsingleonly[$t] ) ) {
00450                                                 $badtag = true;
00451                                         } elseif ( $slash ) {
00452                                                 # Closing a tag... is it the one we just opened?
00453                                                 $ot = @array_pop( $tagstack );
00454                                                 if ( $ot != $t ) {
00455                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
00456                                                                 # Pop all elements with an optional close tag
00457                                                                 # and see if we find a match below them
00458                                                                 $optstack = array();
00459                                                                 array_push( $optstack, $ot );
00460                                                                 wfSuppressWarnings();
00461                                                                 $ot = array_pop( $tagstack );
00462                                                                 wfRestoreWarnings();
00463                                                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
00464                                                                         array_push( $optstack, $ot );
00465                                                                         wfSuppressWarnings();
00466                                                                         $ot = array_pop( $tagstack );
00467                                                                         wfRestoreWarnings();
00468                                                                 }
00469                                                                 if ( $t != $ot ) {
00470                                                                         # No match. Push the optional elements back again
00471                                                                         $badtag = true;
00472                                                                         wfSuppressWarnings();
00473                                                                         $ot = array_pop( $optstack );
00474                                                                         wfRestoreWarnings();
00475                                                                         while ( $ot ) {
00476                                                                                 array_push( $tagstack, $ot );
00477                                                                                 wfSuppressWarnings();
00478                                                                                 $ot = array_pop( $optstack );
00479                                                                                 wfRestoreWarnings();
00480                                                                         }
00481                                                                 }
00482                                                         } else {
00483                                                                 @array_push( $tagstack, $ot );
00484                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
00485                                                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
00486                                                                         $badtag = true;
00487                                                                 }
00488                                                         }
00489                                                 } else {
                                                              # Leaving a table: restore the tag stack that was
                                                              # saved when this table was opened.
00490                                                         if ( $t == 'table' ) {
00491                                                                 $tagstack = array_pop( $tablestack );
00492                                                         }
00493                                                 }
00494                                                 $newparams = '';
00495                                         } else {
00496                                                 # Keep track for later
00497                                                 if ( isset( $tabletags[$t] ) &&
00498                                                 !in_array( 'table', $tagstack ) ) {
00499                                                         $badtag = true;
00500                                                 } elseif ( in_array( $t, $tagstack ) &&
00501                                                 !isset( $htmlnest [$t ] ) ) {
00502                                                         $badtag = true;
00503                                                 # Is it a self closed htmlpair ? (bug 5487)
00504                                                 } elseif ( $brace == '/>' &&
00505                                                 isset( $htmlpairs[$t] ) ) {
00506                                                         $badtag = true;
00507                                                 } elseif ( isset( $htmlsingleonly[$t] ) ) {
00508                                                         # Hack to force empty tag for uncloseable elements
00509                                                         $brace = '/>';
00510                                                 } elseif ( isset( $htmlsingle[$t] ) ) {
00511                                                         # Hack to not close $htmlsingle tags
00512                                                         $brace = null;
00513                                                 } elseif ( isset( $tabletags[$t] )
00514                                                 && in_array( $t, $tagstack ) ) {
00515                                                         // New table tag but forgot to close the previous one
00516                                                         $text .= "</$t>";
00517                                                 } else {
                                                              # Opening a table starts a fresh tag stack; the
                                                              # current one is parked on $tablestack.
00518                                                         if ( $t == 'table' ) {
00519                                                                 array_push( $tablestack, $tagstack );
00520                                                                 $tagstack = array();
00521                                                         }
00522                                                         array_push( $tagstack, $t );
00523                                                 }
00524 
00525                                                 # Replace any variables or template parameters with
00526                                                 # plaintext results.
00527                                                 if( is_callable( $processCallback ) ) {
00528                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
00529                                                 }
00530 
00531                                                 # Strip non-approved attributes from the tag
00532                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
00533                                         }
00534                                         if ( !$badtag ) {
00535                                                 $rest = str_replace( '>', '&gt;', $rest );
00536                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00537                                                 $text .= "<$slash$t$newparams$close>$rest";
00538                                                 continue;
00539                                         }
00540                                 }
                                      # Rejected, unknown or unparseable tag: emit the piece as escaped text.
00541                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
00542                         }
00543                         # Close off any remaining tags
00544                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
00545                                 $text .= "</$t>\n";
00546                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
00547                         }
00548                 } else {
00549                         # this might be possible using tidy itself
                              # Tidy mode: no nesting checks here, only the whitelist test and
                              # attribute cleanup for each piece.
00550                         foreach ( $bits as $x ) {
00551                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00552                                 $x, $regs );
00553                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00554                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00555                                         if( is_callable( $processCallback ) ) {
00556                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
00557                                         }
00558                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
00559                                         $rest = str_replace( '>', '&gt;', $rest );
00560                                         $text .= "<$slash$t$newparams$brace$rest";
00561                                 } else {
00562                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
00563                                 }
00564                         }
00565                 }
00566                 wfProfileOut( __METHOD__ );
00567                 return $text;
00568         }
00569 
/**
 * Strip HTML comments ('<!--' up to and including the next '-->') from $text.
 *
 * If a comment, ignoring the spaces around it, is preceded and followed by a
 * newline, the surrounding spaces and one of the two newlines are removed
 * together with it so that no blank line is left behind. Otherwise only the
 * comment itself is removed. An unterminated comment stops the processing and
 * is left in place.
 *
 * The search restarts from the beginning of the string after every removal.
 *
 * @param $text String
 * @return string
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );

	$open = strpos( $text, '<!--' );
	while ( $open !== false ) {
		$terminator = strpos( $text, '-->', $open + 4 );
		if ( $terminator === false ) {
			# Unterminated comment; leave the remainder untouched
			break;
		}
		# Offset of the first character after the closing '-->'
		$close = $terminator + 3;

		# Walk left from the character just before the comment over any
		# spaces, stopping at the start of the string.
		$left = max( $open - 1, 0 );
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
		}

		# Walk right from the end of the comment over any spaces.
		$right = $close;
		while ( substr( $text, $right, 1 ) === ' ' ) {
			$right++;
		}

		$newlineBefore = ( substr( $text, $left, 1 ) === "\n" );
		$newlineAfter = ( substr( $text, $right, 1 ) === "\n" );

		if ( $newlineBefore && $newlineAfter ) {
			# The comment sits alone on its line: collapse both newlines,
			# the padding spaces and the comment into a single newline.
			$text = substr_replace( $text, "\n", $left, $right - $left + 1 );
		} else {
			# Cut out the comment only.
			$text = substr_replace( $text, '', $open, $close - $open );
		}

		$open = strpos( $text, '<!--' );
	}

	wfProfileOut( __METHOD__ );
	return $text;
}
00614 
/**
 * Validate a set of attributes for one element type.
 *
 * Looks up the attribute whitelist for $element with attributeWhitelist() and
 * hands it, together with $attribs, to validateAttributes().
 *
 * @param $attribs Array: attribute name => value pairs
 * @param $element String: element name used to select the whitelist
 * @return mixed whatever validateAttributes() returns (presumably the
 *   filtered attribute array - confirm)
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = Sanitizer::attributeWhitelist( $element );
	return Sanitizer::validateAttributes( $attribs, $allowed );
}
00634 
/**
 * Reduce a decoded attribute map to the attributes that are allowed and
 * whose values pass the per-attribute checks below.
 *
 * - xmlns:* declarations are kept when $wgAllowRdfaAttributes is set, unless
 *   the value matches EVIL_URI_PATTERN.
 * - data-* attributes bypass the whitelist when $wgHtml5 is set.
 * - 'style' values are passed through checkCss(), 'id' values through
 *   escapeId() with the 'noninitial' option.
 * - rel/rev, RDFa and microdata attributes are dropped when their value
 *   matches EVIL_URI_PATTERN.
 * - 'href' and 'src' are dropped unless they start with a protocol from
 *   wfUrlProtocols() (this also drops relative URLs).
 * - With $wgAllowMicrodataAttributes set, itemtype/itemid/itemref are removed
 *   when no itemscope attribute survived.
 *
 * @param array $attribs   Attribute name => value
 * @param array $whitelist List of allowed attribute names
 * @return array Surviving attributes, name => (possibly rewritten) value
 */
static function validateAttributes( $attribs, $whitelist ) {
    global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;

    // Attributes whose values may hold URLs, URIs and/or CURIs.
    $uriLikeAttributes = array(
        'rel', 'rev',
        'about', 'property', 'resource', 'datatype', 'typeof',          // RDFa
        'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',       // HTML5 microdata
    );

    // Flip so that membership becomes an isset() lookup.
    $allowed = array_flip( $whitelist );
    $protocolRegex = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

    $result = array();
    foreach ( $attribs as $name => $val ) {
        // XML namespace declarations are only meaningful when RDFa is enabled.
        if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
            if ( !preg_match( self::EVIL_URI_PATTERN, $val ) ) {
                $result[$name] = $val;
            }
            continue;
        }

        // Anything starting with "data-" is accepted in HTML5 mode;
        // everything else has to be on the whitelist.
        $isDataAttribute = $wgHtml5 && preg_match( '/^data-/i', $name );
        if ( !$isDataAttribute && !isset( $allowed[$name] ) ) {
            continue;
        }

        // Strip javascript "expression" and friends from inline styles.
        if ( $name == 'style' ) {
            $val = self::checkCss( $val );
        }

        if ( $name === 'id' ) {
            $val = self::escapeId( $val, 'noninitial' );
        }

        // Paranoia: allow "simple" values but suppress javascript.
        if ( in_array( $name, $uriLikeAttributes, true )
            && preg_match( self::EVIL_URI_PATTERN, $val )
        ) {
            continue;
        }

        // Elements using href/src are not allowed directly, but tag hook
        // handlers can rely on this validation.
        if ( ( $name === 'href' || $name === 'src' )
            && !preg_match( $protocolRegex, $val )
        ) {
            continue;
        }

        // A later occurrence of the same attribute overrides an earlier one,
        // so the output has at most one attribute of each name.
        $result[$name] = $val;
    }

    // itemtype, itemid and itemref don't make sense without itemscope.
    // TODO: Strip itemprop if we aren't descendants of an itemscope.
    if ( $wgAllowMicrodataAttributes && !array_key_exists( 'itemscope', $result ) ) {
        unset( $result['itemtype'], $result['itemid'], $result['itemref'] );
    }

    return $result;
}
00720 
/**
 * Merge two attribute arrays. Entries in $b override entries in $a, except
 * for 'class': when both arrays carry different string values for it, the
 * class names of both are combined and de-duplicated.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
    $merged = array_merge( $a, $b );

    if ( !isset( $a['class'], $b['class'] ) ) {
        return $merged;
    }

    $first = $a['class'];
    $second = $b['class'];
    if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
        return $merged;
    }

    // Split the concatenation on whitespace, dropping empty pieces.
    $names = preg_split( '/\s+/', $first . ' ' . $second, -1, PREG_SPLIT_NO_EMPTY );
    $merged['class'] = implode( ' ', array_unique( $names ) );

    return $merged;
}
00742 
/**
 * Normalize a CSS attribute value and reject it if it still contains
 * constructs considered dangerous.
 *
 * The value is decoded step by step (character references, CSS escape
 * sequences, look-alike characters, comments) so that the final keyword
 * check sees the text a lenient browser would see.
 *
 * @param string $value Raw 'style' attribute value
 * @return string The normalized value, or one of the placeholder comments
 *   '/* invalid control char *' . '/' or '/* insecure input *' . '/' when the
 *   value is rejected
 */
static function checkCss( $value ) {
    // Decode character references like &#123;
    $value = Sanitizer::decodeCharReferences( $value );

    // Decode escape sequences and line continuation
    // See the grammar in the CSS 2 spec, appendix D.
    // This has to be done AFTER decoding character references.
    // This means it isn't possible for this function to return
    // unsanitized escape sequences. It is possible to manufacture
    // input that contains character references that decode to
    // escape sequences that decode to character references, but
    // it's OK for the return value to contain character references
    // because the caller is supposed to escape those anyway.
    static $decodeRegex;
    if ( !$decodeRegex ) {
        $space = '[\\x20\\t\\r\\n\\f]';
        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
        $backslash = '\\\\';
        $decodeRegex = "/ $backslash
            (?:
                ($nl) |  # 1. Line continuation
                ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
                (.) | # 3. backslash cancelling special meaning
                () | # 4. backslash at end of string
            )/xu";
    }
    $value = preg_replace_callback( $decodeRegex,
        array( __CLASS__, 'cssDecodeCallback' ), $value );

    // Normalize the Halfwidth and Fullwidth Unicode block that IE6 might
    // treat as ascii. The range is spelled with explicit code point escapes
    // (U+FF01 to U+FF5A) so that it cannot be confused with, or re-encoded
    // into, the ASCII range '!'..'z'; matching ASCII here would push plain
    // characters through the "- 65248" shift below and corrupt the value.
    $value = preg_replace_callback(
        '/[\x{FF01}-\x{FF5A}]/u',
        function ( $matches ) {
            $cp = utf8ToCodepoint( $matches[0] );
            if ( $cp === false ) {
                return '';
            }
            return chr( $cp - 65248 ); // ASCII range \x21-\x7A
        },
        $value
    );

    // Convert more characters IE6 might treat as ascii
    // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
    $value = str_replace(
        array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
        array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
        $value
    );

    // Remove any comments; IE gets token splitting wrong
    // This must be done AFTER decoding character references and
    // escape sequences, because those steps can introduce comments
    // This step cannot introduce character references or escape
    // sequences, because it replaces comments with spaces rather
    // than removing them completely.
    $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

    // Remove anything after a comment-start token, to guard against
    // incorrect client implementations.
    $commentPos = strpos( $value, '/*' );
    if ( $commentPos !== false ) {
        $value = substr( $value, 0, $commentPos );
    }

    // S followed by repeat, iteration, or prolonged sound marks,
    // which IE will treat as "ss"
    $value = preg_replace(
        '/s(?:
            \xE3\x80\xB1 | # U+3031
            \xE3\x82\x9D | # U+309D
            \xE3\x83\xBC | # U+30FC
            \xE3\x83\xBD | # U+30FD
            \xEF\xB9\xBC | # U+FE7C
            \xEF\xB9\xBD | # U+FE7D
            \xEF\xBD\xB0   # U+FF70
        )/ix',
        'ss',
        $value
    );

    // Reject problematic keywords and control characters
    if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
        return '/* invalid control char */';
    } elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
        return '/* insecure input */';
    }
    return $value;
}
00849 
/**
 * preg_replace_callback() handler used by checkCss() to decode one CSS
 * backslash escape.
 *
 * @param array $matches Match groups: 1 = line continuation, 2 = hex
 *   character number, 3 = single escaped character; when none of them is
 *   filled the backslash stood at the end of the string
 * @return string Replacement text
 */
static function cssDecodeCallback( $matches ) {
    if ( $matches[1] !== '' ) {
        // Line continuation: the escape simply disappears.
        return '';
    }

    if ( $matches[2] !== '' ) {
        $decoded = codepointToUtf8( hexdec( $matches[2] ) );
    } elseif ( $matches[3] !== '' ) {
        $decoded = $matches[3];
    } else {
        $decoded = '\\';
    }

    // These characters need to stay escaped inside strings; re-emit them as
    // a clean hex escape to avoid parsing errors by clients. Everything else
    // was an unnecessary escape and is returned decoded.
    $mustStayEscaped = array( "\n", '"', "'", '\\' );
    if ( in_array( $decoded, $mustStayEscaped ) ) {
        return '\\' . dechex( ord( $decoded ) ) . ' ';
    }
    return $decoded;
}
00874 
/**
 * Take the raw attribute text of a tag, decode and validate it for the
 * given element, and render the surviving attributes back to a string.
 *
 * @param string $text    Raw attribute text from inside a tag
 * @param string $element Element name, passed on to validateTagAttributes()
 * @return string '' when nothing survives, otherwise the attributes as
 *   name="value" pairs, each preceded by a single space
 */
static function fixTagAttributes( $text, $element ) {
    if ( trim( $text ) == '' ) {
        return '';
    }

    $valid = Sanitizer::validateTagAttributes(
        Sanitizer::decodeTagAttributes( $text ),
        $element
    );

    $rendered = '';
    foreach ( $valid as $name => $val ) {
        $encName = htmlspecialchars( $name );
        $encVal = Sanitizer::safeEncodeAttribute( $val );
        $rendered .= " $encName=\"$encVal\"";
    }
    return $rendered;
}
00911 
/**
 * Encode a string for use as an HTML attribute value.
 *
 * @param string $text
 * @return string HTML-escaped text (quotes included) with newline, carriage
 *   return and tab turned into numeric character references
 */
static function encodeAttribute( $text ) {
    // Whitespace is normalized during attribute decoding, so non-space
    // whitespace has to be encoded ahead of time or it won't be preserved.
    $whitespaceRefs = array(
        "\n" => '&#10;',
        "\r" => '&#13;',
        "\t" => '&#9;',
    );

    return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
00931 
/**
 * Encode an attribute value like encodeAttribute() and additionally armor
 * character sequences that later parsing stages could expand.
 *
 * @param string $text
 * @return string Encoded and armored attribute value
 */
static function safeEncodeAttribute( $text ) {
    // Templates and links may be expanded in later parsing,
    // creating invalid or dangerous output. Suppress this.
    $parserSensitive = array(
        '<'    => '&lt;',   // This should never happen,
        '>'    => '&gt;',   // we've received invalid input
        '"'    => '&quot;', // which should have been escaped.
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
    );
    $armored = strtr( Sanitizer::encodeAttribute( $text ), $parserSensitive );

    // Stupid hack: break up anything that looks like a URL protocol by
    // encoding its colon (see armorLinksCallback()).
    $protocolRegex = '/((?i)' . wfUrlProtocols() . ')/';
    return preg_replace_callback(
        $protocolRegex,
        array( 'Sanitizer', 'armorLinksCallback' ),
        $armored
    );
}
00964 
/**
 * Turn an arbitrary string into something usable as an id attribute.
 *
 * With $wgHtml5 and $wgExperimentalHtmlIds both set (and no 'legacy'
 * option) only whitespace and a few special characters are collapsed into
 * underscores. Otherwise the id is URL-encoded with '%' replaced by '.'
 * (and '%3A' by ':'), and prefixed with 'x' when it does not start with a
 * letter, unless the 'noninitial' option is given.
 *
 * @param string       $id      Id to escape
 * @param string|array $options 'noninitial' and/or 'legacy'; a single
 *   string is treated as a one-element list
 * @return string
 */
static function escapeId( $id, $options = array() ) {
    global $wgHtml5, $wgExperimentalHtmlIds;
    $flags = (array)$options;

    $useHtml5Rules = $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $flags );
    if ( $useHtml5Rules ) {
        $html5Id = Sanitizer::decodeCharReferences( $id );
        $html5Id = trim( preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $html5Id ), '_' );
        // An empty result must have been all whitespace to start with.
        return $html5Id === '' ? '_' : $html5Id;
    }

    // HTML4-style escaping
    $encoded = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
    $encoded = str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );

    // Initial character must be a letter!
    $needsLetterPrefix = !preg_match( '/^[a-zA-Z]/', $encoded )
        && !in_array( 'noninitial', $flags );

    return $needsLetterPrefix ? "x$encoded" : $encoded;
}
01028 
/**
 * Turn an arbitrary string into something usable as a class name.
 *
 * @param string $class
 * @return string The input with a leading digit or hyphen, ASCII control
 *   characters, spaces, most ASCII punctuation and the byte pair C2 A0
 *   replaced by underscores, runs of underscores collapsed, and trailing
 *   underscores removed
 */
static function escapeClass( $class ) {
    // Convert ugly stuff to underscores...
    $unsafe = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
    $withUnderscores = preg_replace( $unsafe, '_', $class );

    // ...and kill underscores in ugly places.
    $collapsed = preg_replace( '/_+/', '_', $withUnderscores );

    return rtrim( $collapsed, '_' );
}
01047 
/**
 * Escape a string for HTML output after first decoding any character
 * references it contains.
 *
 * @param string $html
 * @return string
 */
static function escapeHtmlAllowEntities( $html ) {
    $decoded = Sanitizer::decodeCharReferences( $html );

    // It seems wise to escape ' as well as ", as a matter of course.
    return htmlspecialchars( $decoded, ENT_QUOTES );
}
01062 
/**
 * preg_replace_callback() handler for safeEncodeAttribute(): encodes the
 * colon(s) of the captured text as a numeric character reference.
 *
 * @param array $matches Match groups; group 1 is the text to armor
 * @return string
 */
private static function armorLinksCallback( $matches ) {
    $protocol = $matches[1];
    return str_replace( ':', '&#58;', $protocol );
}
01071 
/**
 * Parse the raw attribute text of a tag into an associative array.
 *
 * Attribute names are lowercased; values have runs of tab, CR, LF and
 * space collapsed to one space, are trimmed, and have their character
 * references decoded. A repeated attribute keeps its last value.
 *
 * @param string $text Raw attribute text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
    $decoded = array();
    if ( trim( $text ) == '' ) {
        return $decoded;
    }

    $matches = array();
    $found = preg_match_all( self::getAttribsRegex(), $text, $matches, PREG_SET_ORDER );
    if ( !$found ) {
        return $decoded;
    }

    foreach ( $matches as $match ) {
        $raw = Sanitizer::getTagAttributeCallback( $match );

        // Normalize whitespace
        $normalized = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

        // Decode character references
        $decoded[strtolower( $match[1] )] = Sanitizer::decodeCharReferences( $normalized );
    }

    return $decoded;
}
01108 
/**
 * Pick the attribute value out of one match set produced with the
 * attribute regex.
 *
 * @param array $set Match groups: 1 = attribute name, 3 = double-quoted
 *   value, 4 = single-quoted value, 5 = unquoted value, 6 = unquoted
 *   #XXXXXX colour value
 * @return string The value, or the attribute name itself when the
 *   attribute was given without a value
 * @throws MWException when groups 3-6 are all absent although group 2 is set
 */
private static function getTagAttributeCallback( $set ) {
    // Highest group first: illegal #XXXXXX colour with no quotes, then
    // no quotes, single-quoted, double-quoted.
    foreach ( array( 6, 5, 4, 3 ) as $group ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( !isset( $set[2] ) ) {
        // In XHTML, attributes must have a value.
        // For 'reduced' form, return explicitly the attribute name here.
        return $set[1];
    }

    throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
01137 
/**
 * Normalize an attribute value: character references first, then
 * whitespace, and finally double quotes are replaced by &quot;.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue( $text ) {
    $withRefs = Sanitizer::normalizeCharReferences( $text );
    $spaced = self::normalizeWhitespace( $withRefs );

    return str_replace( '"', '&quot;', $spaced );
}
01154 
/**
 * Replace every CR LF pair and every single space, CR, LF or tab with one
 * space character. Runs are not collapsed.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $anyWhitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

    return preg_replace( $anyWhitespace, ' ', $text );
}
01165 
/**
 * Normalize the whitespace of a section name: every run of spaces and/or
 * underscores becomes a single space, and the result is trimmed.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
    $collapsed = preg_replace( '/[ _]+/', ' ', $section );

    return trim( $collapsed );
}
01177 
/**
 * Run every character reference (and every bare ampersand) matched by
 * CHAR_REFS_REGEX through normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
    $handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

    return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param array $matches Match groups of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal number, 3 = hexadecimal number
 * @return string The normalized reference, or the HTML-escaped original
 *   match when no group matched or the numeric reference was rejected
 */
static function normalizeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        $normalized = Sanitizer::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        $normalized = Sanitizer::decCharReference( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $normalized = Sanitizer::hexCharReference( $matches[3] );
    } else {
        $normalized = null;
    }

    return $normalized === null
        ? htmlspecialchars( $matches[0] )
        : $normalized;
}
01218 
/**
 * Normalize a named entity.
 *
 * @param string $name Entity name without '&' and ';'
 * @return string The alias target for names in $htmlEntityAliases; the
 *   entity unchanged for lt, gt, amp and quot; a decimal numeric reference
 *   for other names in $htmlEntities; otherwise the text with its ampersand
 *   escaped
 */
static function normalizeEntity( $name ) {
    if ( isset( self::$htmlEntityAliases[$name] ) ) {
        return '&' . self::$htmlEntityAliases[$name] . ';';
    }

    $keptAsNamed = array( 'lt', 'gt', 'amp', 'quot' );
    if ( in_array( $name, $keptAsNamed ) ) {
        return "&$name;";
    }

    if ( isset( self::$htmlEntities[$name] ) ) {
        return '&#' . self::$htmlEntities[$name] . ';';
    }

    return "&amp;$name;";
}
01241 
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null '&#N;' when validateCodepoint() accepts the value,
 *   null otherwise
 */
static function decCharReference( $codepoint ) {
    $point = intval( $codepoint );

    return Sanitizer::validateCodepoint( $point )
        ? sprintf( '&#%d;', $point )
        : null;
}
01254 
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null '&#xN;' (lowercase hex) when validateCodepoint()
 *   accepts the value, null otherwise
 */
static function hexCharReference( $codepoint ) {
    $point = hexdec( $codepoint );

    return Sanitizer::validateCodepoint( $point )
        ? sprintf( '&#x%x;', $point )
        : null;
}
01267 
/**
 * Tell whether a code point is accepted in a character reference.
 *
 * Accepted are tab, LF, CR, 0x20-0xD7FF, 0xE000-0xFFFD and
 * 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
        return true;
    }
    if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
        return true;
    }
    if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
        return true;
    }
    return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
01281 
/**
 * Run every character reference (and every bare ampersand) matched by
 * CHAR_REFS_REGEX through decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    $handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );

    return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
01295 
/**
 * Decode character references like decodeCharReferences() and, if at least
 * one replacement was made, pass the result through
 * $wgContLang->normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
    global $wgContLang;

    $replaced = 0;
    $decoded = preg_replace_callback(
        self::CHAR_REFS_REGEX,
        array( 'Sanitizer', 'decodeCharReferencesCallback' ),
        $text,
        -1, // no limit
        $replaced
    );

    // Untouched text is returned without calling normalize().
    return $replaced ? $wgContLang->normalize( $decoded ) : $decoded;
}
01319 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * @param array $matches Match groups of CHAR_REFS_REGEX: 1 = named entity,
 *   2 = decimal number, 3 = hexadecimal number
 * @return string The decoded text, or the match unchanged when none of
 *   these groups matched
 */
static function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return Sanitizer::decodeEntity( $matches[1] );
    }
    if ( $matches[2] != '' ) {
        return Sanitizer::decodeChar( intval( $matches[2] ) );
    }
    if ( $matches[3] != '' ) {
        return Sanitizer::decodeChar( hexdec( $matches[3] ) );
    }

    // Last case should be an ampersand by itself
    return $matches[0];
}
01335 
/**
 * Convert a code point to its UTF-8 encoding.
 *
 * @param int $codepoint
 * @return string The encoded character, or UTF8_REPLACEMENT when
 *   validateCodepoint() rejects the code point
 */
static function decodeChar( $codepoint ) {
    if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
        return UTF8_REPLACEMENT;
    }

    return codepointToUtf8( $codepoint );
}
01350 
/**
 * Decode a named entity to UTF-8.
 *
 * @param string $name Entity name without '&' and ';'
 * @return string The character for names known to $htmlEntities (after
 *   resolving $htmlEntityAliases); otherwise the entity text built from the
 *   alias-resolved name
 */
static function decodeEntity( $name ) {
    $canonical = isset( self::$htmlEntityAliases[$name] )
        ? self::$htmlEntityAliases[$name]
        : $name;

    if ( !isset( self::$htmlEntities[$canonical] ) ) {
        return "&$canonical;";
    }

    return codepointToUtf8( self::$htmlEntities[$canonical] );
}
01369 
/**
 * Fetch the list of acceptable attributes for a given element name.
 *
 * The table from setupAttributeWhitelist() is built on first use and kept
 * in a function-static variable.
 *
 * @param string $element
 * @return array Allowed attribute names; empty for unknown elements
 */
static function attributeWhitelist( $element ) {
    static $cache = null;

    if ( $cache === null ) {
        $cache = Sanitizer::setupAttributeWhitelist();
    }

    if ( isset( $cache[$element] ) ) {
        return $cache[$element];
    }
    return array();
}
01385 
/**
 * Build the table of allowed attributes per element.
 *
 * The common attribute set grows by the RDFa attributes when
 * $wgAllowRdfaAttributes is set, and by the microdata attributes when both
 * $wgHtml5 and $wgAllowMicrodataAttributes are set.
 *
 * @return array Element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
    global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;

    $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );

    if ( $wgAllowRdfaAttributes ) {
        // RDFa attributes as specified in section 9 of
        // http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
        array_push( $common, 'about', 'property', 'resource', 'datatype', 'typeof' );
    }

    if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
        // HTML5 microdata attributes as specified by
        // http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
        array_push( $common, 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' );
    }

    $tablealign = array( 'align', 'char', 'charoff', 'valign' );
    $tablecell = array(
        'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
        'nowrap', 'width', 'height', 'bgcolor', // the last four are deprecated
    );

    // Attribute sets shared by several elements.
    $block = array_merge( $common, array( 'align' ) );
    $edit = array_merge( $common, array( 'cite', 'datetime' ) );
    $rowGroup = array_merge( $common, $tablealign );
    $column = array_merge( $common, array( 'span', 'width' ), $tablealign );
    $cell = array_merge( $common, $tablecell, $tablealign );

    // Numbers refer to sections in HTML 4.01 standard describing the element.
    // See: http://www.w3.org/TR/html4/
    return array(
        // 7.5.4
        'div' => $block,
        'center' => $common, // deprecated
        'span' => $block, // ??

        // 7.5.5
        'h1' => $block,
        'h2' => $block,
        'h3' => $block,
        'h4' => $block,
        'h5' => $block,
        'h6' => $block,

        // 7.5.6 (address) and 8.2.4 (bdo) are not listed

        // 9.2.1 (acronym is not listed)
        'em' => $common,
        'strong' => $common,
        'cite' => $common,
        'dfn' => $common,
        'code' => $common,
        'samp' => $common,
        'kbd' => $common,
        'var' => $common,
        'abbr' => $common,

        // 9.2.2 (q is not listed)
        'blockquote' => array_merge( $common, array( 'cite' ) ),

        // 9.2.3
        'sub' => $common,
        'sup' => $common,

        // 9.3.1
        'p' => $block,

        // 9.3.2
        'br' => array( 'id', 'class', 'title', 'style', 'clear' ),

        // 9.3.4
        'pre' => array_merge( $common, array( 'width' ) ),

        // 9.4
        'ins' => $edit,
        'del' => $edit,

        // 10.2
        'ul' => array_merge( $common, array( 'type' ) ),
        'ol' => array_merge( $common, array( 'type', 'start' ) ),
        'li' => array_merge( $common, array( 'type', 'value' ) ),

        // 10.3
        'dl' => $common,
        'dd' => $common,
        'dt' => $common,

        // 11.2.1
        'table' => array_merge( $common, array(
            'summary', 'width', 'border', 'frame',
            'rules', 'cellspacing', 'cellpadding',
            'align', 'bgcolor',
        ) ),

        // 11.2.2
        'caption' => $block,

        // 11.2.3
        'thead' => $rowGroup,
        'tfoot' => $rowGroup,
        'tbody' => $rowGroup,

        // 11.2.4
        'colgroup' => $column,
        'col' => $column,

        // 11.2.5
        'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

        // 11.2.6
        'td' => $cell,
        'th' => $cell,

        // 12.2
        // NOTE: <a> is not allowed directly, but the attrib whitelist is
        // used from the Parser object. rel/rev esp. for RDFa.
        'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ),

        // 13.2
        // Not usually allowed, but may be used for extension-style hooks
        // such as <math> when it is rasterized, or if $wgAllowImageTag is
        // true
        'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

        // 15.2.1
        'tt' => $common,
        'b' => $common,
        'i' => $common,
        'big' => $common,
        'small' => $common,
        'strike' => $common,
        's' => $common,
        'u' => $common,

        // 15.2.2 (basefont is not listed)
        'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),

        // 15.3
        'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

        // XHTML Ruby annotation text module, simple ruby only
        // (rbc and rtc are not listed; rt does not allow rbspan).
        // http://www.w3c.org/TR/ruby/
        'ruby' => $common,
        'rb' => $common,
        'rt' => $common,
        'rp' => $common,

        // MathML root element, where used for extensions
        // 'title' may not be 100% valid here; it's XHTML
        // http://www.w3.org/TR/REC-MathML/
        'math' => array( 'class', 'style', 'id', 'title' ),

        // HTML 5 section 4.6
        'bdi' => $common,
    );
}
01561 
/**
 * Reduce a fragment of markup to plain text.
 *
 * Three passes are applied in order: every span delimited by '<' and '>'
 * is replaced with the empty string, character references are decoded,
 * and whitespace is normalized.
 *
 * @param $text String: markup to strip
 * @return String: result of the three passes
 */
static function stripAllTags( $text ) {
        # Remove the actual <tags> first
        $tagless = StringUtils::delimiterReplace( '<', '>', '', $text );

        # Then normalize &entities, and finally whitespace
        return self::normalizeWhitespace( self::decodeCharReferences( $tagless ) );
}
01582 
/**
 * Build a DOCTYPE declaration with an internal subset that declares
 * every name in self::$htmlEntities as a numeric character reference.
 *
 * @return String: "<!DOCTYPE html [", one <!ENTITY ...> declaration per
 *   known entity (no separator between them), then "]>" and a newline
 */
static function hackDocType() {
        $declarations = array();
        foreach ( self::$htmlEntities as $name => $number ) {
                // e.g. <!ENTITY amp "&#38;">
                $declarations[] = sprintf( '<!ENTITY %s "&#%s;">', $name, $number );
        }

        return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01600 
/**
 * Clean up a URL before it is used as an external link target.
 *
 * Steps, in order:
 *  - character references are decoded (self::decodeCharReferences());
 *  - square/angle brackets, double quotes, pipes and the bytes
 *    0x00-0x20 and 0x7F are URL-encoded through cleanUrlCallback();
 *  - if the result has the shape "scheme:[//authority][rest]", characters
 *    that are ignored in IDNs are stripped from the authority part and
 *    the brackets of an IPv6 literal are restored.
 *
 * A string that does not have that shape is returned after the first
 * two steps only.
 *
 * @param $url String
 * @return String
 */
static function cleanUrl( $url ) {
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().
        $url = self::decodeCharReferences( $url );

        # Escape any control characters introduced by the above step
        $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
                array( __CLASS__, 'cleanUrlCallback' ), $url );

        # Split into protocol, "//authority" (may be empty) and the rest
        $matches = array();
        if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
                return $url;
        }
        list( /* $whole */, $protocol, $host, $rest ) = $matches;

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/rfc3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        $strip = "/
                \\s|          # general whitespace
                \xc2\xad|     # 00ad SOFT HYPHEN
                \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                \xe2\x81\xa0| # 2060 WORD JOINER
                \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
                /xuD";

        // With the 'u' modifier preg_replace() returns null when the subject
        // is not valid UTF-8. Keep the (already escaped) host in that case
        // instead of silently dropping it from the returned URL.
        $stripped = preg_replace( $strip, '', $host );
        if ( $stripped !== null ) {
                $host = $stripped;
        }

        // IPv6 host names are bracketed with []. The escaping step above
        // turned those brackets into %5B and %5D; put them back so that the
        // authority stays a usable IP literal (optionally followed by a port).
        $ipv6 = array();
        if ( preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!D', $host, $ipv6 ) ) {
                $host = '//[' . $ipv6[1] . ']' . $ipv6[2];
        }

        // @todo FIXME: Validate hostnames here

        return $protocol . $host . $rest;
}
01647 
/**
 * preg_replace_callback() handler used by cleanUrl(): URL-encodes the
 * text of the whole match.
 *
 * @param $matches Array: match array; only element 0 is used
 * @return String: urlencode()d form of $matches[0]
 */
static function cleanUrlCallback( $matches ) {
        $matched = $matches[0];

        return urlencode( $matched );
}
01655 
/**
 * Check whether a string looks like a usable e-mail address.
 *
 * The 'isValidEmailAddr' hook is run first; if it aborts (wfRunHooks()
 * returns false) the value a handler stored in $result is returned as is.
 * Otherwise the address is matched against a liberal pattern: a local
 * part made of RFC 5322 "atext" characters and dots, an at sign, and one
 * or more dot-separated domain labels made of letters, digits and
 * hyphens.
 *
 * The pattern is anchored with the D modifier so that an address followed
 * by a newline is rejected; a bare "$" would also match just before a
 * trailing newline.
 *
 * @param $addr String: e-mail address to check
 * @return Bool (or whatever the aborting hook handler put in $result)
 */
public static function validateEmail( $addr ) {
        $result = null;
        if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
                return $result;
        }

        // Please note the strings below end up inside brackets [], which
        // makes the hyphen "-" a range indicator. Hence it is double
        // backslashed below. See bug 26948
        $rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
        $rfc1034_ldh_str = "a-z0-9\\-" ;

        $HTML5_email_regexp = "/
        ^                      # start of string
        [$rfc5322_atext\\.]+    # user part which is liberal :p
        @                      # at sign
        [$rfc1034_ldh_str]+       # First domain part
        (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
        $                      # End of string, no trailing newline allowed
        /ixD" ; // case Insensitive, eXtended, Dollar matches at very end only

        return (bool) preg_match( $HTML5_email_regexp, $addr );
}
01707 }