php/html/Sanitizer_8php_source.html

00001 <?php
00031 class Sanitizer {
00036         const CHAR_REFS_REGEX =
00037                 '/&([A-Za-z0-9\x80-\xff]+);
00038                  |&\#([0-9]+);
00039                  |&\#[xX]([0-9A-Fa-f]+);
00040                  |(&)/x';
00041
00050         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
00051         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052
00059         static $htmlEntities = array(
00060                 'Aacute'   => 193,
00061                 'aacute'   => 225,
00062                 'Acirc'    => 194,
00063                 'acirc'    => 226,
00064                 'acute'    => 180,
00065                 'AElig'    => 198,
00066                 'aelig'    => 230,
00067                 'Agrave'   => 192,
00068                 'agrave'   => 224,
00069                 'alefsym'  => 8501,
00070                 'Alpha'    => 913,
00071                 'alpha'    => 945,
00072                 'amp'      => 38,
00073                 'and'      => 8743,
00074                 'ang'      => 8736,
00075                 'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00076                 'Aring'    => 197,
00077                 'aring'    => 229,
00078                 'asymp'    => 8776,
00079                 'Atilde'   => 195,
00080                 'atilde'   => 227,
00081                 'Auml'     => 196,
00082                 'auml'     => 228,
00083                 'bdquo'    => 8222,
00084                 'Beta'     => 914,
00085                 'beta'     => 946,
00086                 'brvbar'   => 166,
00087                 'bull'     => 8226,
00088                 'cap'      => 8745,
00089                 'Ccedil'   => 199,
00090                 'ccedil'   => 231,
00091                 'cedil'    => 184,
00092                 'cent'     => 162,
00093                 'Chi'      => 935,
00094                 'chi'      => 967,
00095                 'circ'     => 710,
00096                 'clubs'    => 9827,
00097                 'cong'     => 8773,
00098                 'copy'     => 169,
00099                 'crarr'    => 8629,
00100                 'cup'      => 8746,
00101                 'curren'   => 164,
00102                 'dagger'   => 8224,
00103                 'Dagger'   => 8225,
00104                 'darr'     => 8595,
00105                 'dArr'     => 8659,
00106                 'deg'      => 176,
00107                 'Delta'    => 916,
00108                 'delta'    => 948,
00109                 'diams'    => 9830,
00110                 'divide'   => 247,
00111                 'Eacute'   => 201,
00112                 'eacute'   => 233,
00113                 'Ecirc'    => 202,
00114                 'ecirc'    => 234,
00115                 'Egrave'   => 200,
00116                 'egrave'   => 232,
00117                 'empty'    => 8709,
00118                 'emsp'     => 8195,
00119                 'ensp'     => 8194,
00120                 'Epsilon'  => 917,
00121                 'epsilon'  => 949,
00122                 'equiv'    => 8801,
00123                 'Eta'      => 919,
00124                 'eta'      => 951,
00125                 'ETH'      => 208,
00126                 'eth'      => 240,
00127                 'Euml'     => 203,
00128                 'euml'     => 235,
00129                 'euro'     => 8364,
00130                 'exist'    => 8707,
00131                 'fnof'     => 402,
00132                 'forall'   => 8704,
00133                 'frac12'   => 189,
00134                 'frac14'   => 188,
00135                 'frac34'   => 190,
00136                 'frasl'    => 8260,
00137                 'Gamma'    => 915,
00138                 'gamma'    => 947,
00139                 'ge'       => 8805,
00140                 'gt'       => 62,
00141                 'harr'     => 8596,
00142                 'hArr'     => 8660,
00143                 'hearts'   => 9829,
00144                 'hellip'   => 8230,
00145                 'Iacute'   => 205,
00146                 'iacute'   => 237,
00147                 'Icirc'    => 206,
00148                 'icirc'    => 238,
00149                 'iexcl'    => 161,
00150                 'Igrave'   => 204,
00151                 'igrave'   => 236,
00152                 'image'    => 8465,
00153                 'infin'    => 8734,
00154                 'int'      => 8747,
00155                 'Iota'     => 921,
00156                 'iota'     => 953,
00157                 'iquest'   => 191,
00158                 'isin'     => 8712,
00159                 'Iuml'     => 207,
00160                 'iuml'     => 239,
00161                 'Kappa'    => 922,
00162                 'kappa'    => 954,
00163                 'Lambda'   => 923,
00164                 'lambda'   => 955,
00165                 'lang'     => 9001,
00166                 'laquo'    => 171,
00167                 'larr'     => 8592,
00168                 'lArr'     => 8656,
00169                 'lceil'    => 8968,
00170                 'ldquo'    => 8220,
00171                 'le'       => 8804,
00172                 'lfloor'   => 8970,
00173                 'lowast'   => 8727,
00174                 'loz'      => 9674,
00175                 'lrm'      => 8206,
00176                 'lsaquo'   => 8249,
00177                 'lsquo'    => 8216,
00178                 'lt'       => 60,
00179                 'macr'     => 175,
00180                 'mdash'    => 8212,
00181                 'micro'    => 181,
00182                 'middot'   => 183,
00183                 'minus'    => 8722,
00184                 'Mu'       => 924,
00185                 'mu'       => 956,
00186                 'nabla'    => 8711,
00187                 'nbsp'     => 160,
00188                 'ndash'    => 8211,
00189                 'ne'       => 8800,
00190                 'ni'       => 8715,
00191                 'not'      => 172,
00192                 'notin'    => 8713,
00193                 'nsub'     => 8836,
00194                 'Ntilde'   => 209,
00195                 'ntilde'   => 241,
00196                 'Nu'       => 925,
00197                 'nu'       => 957,
00198                 'Oacute'   => 211,
00199                 'oacute'   => 243,
00200                 'Ocirc'    => 212,
00201                 'ocirc'    => 244,
00202                 'OElig'    => 338,
00203                 'oelig'    => 339,
00204                 'Ograve'   => 210,
00205                 'ograve'   => 242,
00206                 'oline'    => 8254,
00207                 'Omega'    => 937,
00208                 'omega'    => 969,
00209                 'Omicron'  => 927,
00210                 'omicron'  => 959,
00211                 'oplus'    => 8853,
00212                 'or'       => 8744,
00213                 'ordf'     => 170,
00214                 'ordm'     => 186,
00215                 'Oslash'   => 216,
00216                 'oslash'   => 248,
00217                 'Otilde'   => 213,
00218                 'otilde'   => 245,
00219                 'otimes'   => 8855,
00220                 'Ouml'     => 214,
00221                 'ouml'     => 246,
00222                 'para'     => 182,
00223                 'part'     => 8706,
00224                 'permil'   => 8240,
00225                 'perp'     => 8869,
00226                 'Phi'      => 934,
00227                 'phi'      => 966,
00228                 'Pi'       => 928,
00229                 'pi'       => 960,
00230                 'piv'      => 982,
00231                 'plusmn'   => 177,
00232                 'pound'    => 163,
00233                 'prime'    => 8242,
00234                 'Prime'    => 8243,
00235                 'prod'     => 8719,
00236                 'prop'     => 8733,
00237                 'Psi'      => 936,
00238                 'psi'      => 968,
00239                 'quot'     => 34,
00240                 'radic'    => 8730,
00241                 'rang'     => 9002,
00242                 'raquo'    => 187,
00243                 'rarr'     => 8594,
00244                 'rArr'     => 8658,
00245                 'rceil'    => 8969,
00246                 'rdquo'    => 8221,
00247                 'real'     => 8476,
00248                 'reg'      => 174,
00249                 'rfloor'   => 8971,
00250                 'Rho'      => 929,
00251                 'rho'      => 961,
00252                 'rlm'      => 8207,
00253                 'rsaquo'   => 8250,
00254                 'rsquo'    => 8217,
00255                 'sbquo'    => 8218,
00256                 'Scaron'   => 352,
00257                 'scaron'   => 353,
00258                 'sdot'     => 8901,
00259                 'sect'     => 167,
00260                 'shy'      => 173,
00261                 'Sigma'    => 931,
00262                 'sigma'    => 963,
00263                 'sigmaf'   => 962,
00264                 'sim'      => 8764,
00265                 'spades'   => 9824,
00266                 'sub'      => 8834,
00267                 'sube'     => 8838,
00268                 'sum'      => 8721,
00269                 'sup'      => 8835,
00270                 'sup1'     => 185,
00271                 'sup2'     => 178,
00272                 'sup3'     => 179,
00273                 'supe'     => 8839,
00274                 'szlig'    => 223,
00275                 'Tau'      => 932,
00276                 'tau'      => 964,
00277                 'there4'   => 8756,
00278                 'Theta'    => 920,
00279                 'theta'    => 952,
00280                 'thetasym' => 977,
00281                 'thinsp'   => 8201,
00282                 'THORN'    => 222,
00283                 'thorn'    => 254,
00284                 'tilde'    => 732,
00285                 'times'    => 215,
00286                 'trade'    => 8482,
00287                 'Uacute'   => 218,
00288                 'uacute'   => 250,
00289                 'uarr'     => 8593,
00290                 'uArr'     => 8657,
00291                 'Ucirc'    => 219,
00292                 'ucirc'    => 251,
00293                 'Ugrave'   => 217,
00294                 'ugrave'   => 249,
00295                 'uml'      => 168,
00296                 'upsih'    => 978,
00297                 'Upsilon'  => 933,
00298                 'upsilon'  => 965,
00299                 'Uuml'     => 220,
00300                 'uuml'     => 252,
00301                 'weierp'   => 8472,
00302                 'Xi'       => 926,
00303                 'xi'       => 958,
00304                 'Yacute'   => 221,
00305                 'yacute'   => 253,
00306                 'yen'      => 165,
00307                 'Yuml'     => 376,
00308                 'yuml'     => 255,
00309                 'Zeta'     => 918,
00310                 'zeta'     => 950,
00311                 'zwj'      => 8205,
00312                 'zwnj'     => 8204
00313         );
00314
00318         static $htmlEntityAliases = array(
00319                 'רלמ' => 'rlm',
00320                 'رلم' => 'rlm',
00321         );
00322
00326         static $attribsRegex;
00327
/**
 * Return the PCRE pattern that matches a single attribute: a name,
 * optionally followed by "=" and a value that is double-quoted,
 * single-quoted, bare, or a "#hex" colour.
 *
 * The pattern is assembled on the first call and stored in
 * self::$attribsRegex; later calls hand back the stored string.
 *
 * @return string
 */
static function getAttribsRegex() {
    // Already built on an earlier call: nothing to do.
    if ( self::$attribsRegex !== null ) {
        return self::$attribsRegex;
    }

    $ws = '[\x09\x0a\x0d\x20]';
    $nameStart = '[:A-Z_a-z0-9]';
    $nameChar = '[:A-Z_a-z-.0-9]';

    // Extended (x) mode: the layout whitespace and the "#" remarks inside
    // the pattern are ignored by PCRE.
    self::$attribsRegex =
        "/(?:^|{$ws})({$nameStart}{$nameChar}*)
          ({$ws}*={$ws}*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                     # colors are specified like this.
                                     # We'll be normalizing it.
                )
        )?(?={$ws}|\$)/sx";

    return self::$attribsRegex;
}
00354
/**
 * Clean up HTML: drop comments, keep only whitelisted elements (with their
 * attributes filtered through Sanitizer::fixTagAttributes) and turn
 * everything else that looks like markup into escaped text.
 *
 * When $wgUseTidy is off, the method also keeps a stack of open elements so
 * that mismatched closing tags are escaped and elements still open at the
 * end of the text are closed. When $wgUseTidy is on, only the whitelist and
 * attribute filtering are applied.
 *
 * The element tables are built once per request on the first call; the
 * value of $wgAllowImageTag is therefore only read on that first call.
 *
 * @param $text String: text to clean
 * @param $processCallback Callback: if callable, invoked as
 *   callback( &$params, $args ) with the raw attribute text of a
 *   whitelisted tag so it can rewrite that text before it is filtered
 * @param $args Mixed: passed through to $processCallback unchanged
 * @param $extratags Array: additional element names to allow; they are
 *   treated as elements that must be closed
 * @param $removetags Array: element names to take out of the whitelist
 * @return String
 */
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
    global $wgUseTidy;

    static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

    wfProfileIn( __METHOD__ );

    if ( !$staticInitialised ) {

        $htmlpairsStatic = array( # Tags that must be closed
            'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
            'strike', 'strong', 'tt', 'var', 'div', 'center',
            'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
            'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
            'kbd', 'samp'
        );
        $htmlsingle = array(
            'br', 'hr', 'li', 'dt', 'dd'
        );
        $htmlsingleonly = array( # Elements that cannot have close tags
            'br', 'hr'
        );
        $htmlnest = array( # Tags that can be nested--??
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
            'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
        );
        $tabletags = array( # Can only appear inside table, we will close them
            'td', 'th', 'tr',
        );
        $htmllist = array( # Tags used by list
            'ul','ol',
        );
        $listtags = array( # Tags that can appear in a list
            'li',
        );

        global $wgAllowImageTag;
        if ( $wgAllowImageTag ) {
            $htmlsingle[] = 'img';
            $htmlsingleonly[] = 'img';
        }

        $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
        $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

        # Convert them all to hashtables for faster lookup
        $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
            'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
        foreach ( $vars as $var ) {
            $$var = array_flip( $$var );
        }
        $staticInitialised = true;
    }
    # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
    $extratags = array_flip( $extratags );
    $removetags = array_flip( $removetags );
    $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
    $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments( $text );
    $bits = explode( '<', $text );
    $text = str_replace( '>', '&gt;', array_shift( $bits ) );
    if ( !$wgUseTidy ) {
        # $tagstack: elements currently open (innermost last).
        # $tablestack: the $tagstack that was active when each enclosing
        # <table> was opened; every table starts with a fresh $tagstack.
        $tagstack = $tablestack = array();
        foreach ( $bits as $x ) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
                list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
            } else {
                $slash = $params = $brace = $rest = null;
                # Empty string rather than null so strtolower() below is
                # never handed null; the whitelist lookup fails either way.
                $t = '';
            }

            $badtag = false;
            if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                # Check our stack
                if ( $slash && isset( $htmlsingleonly[$t] ) ) {
                    $badtag = true;
                } elseif ( $slash ) {
                    # Closing a tag... is it the one we just opened?
                    # array_pop() returns null (without a warning) on an empty stack.
                    $ot = array_pop( $tagstack );
                    if ( $ot !== $t ) {
                        if ( $ot !== null && isset( $htmlsingleallowed[$ot] ) ) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array( $ot );
                            $ot = array_pop( $tagstack );
                            while ( $ot !== null && $ot !== $t && isset( $htmlsingleallowed[$ot] ) ) {
                                $optstack[] = $ot;
                                $ot = array_pop( $tagstack );
                            }
                            if ( $ot !== $t ) {
                                # No match. Put the stack back exactly as it was:
                                # first the element that stopped the scan (it used
                                # to be dropped here, so e.g. an open <table> was
                                # forgotten and never closed), then the optional
                                # elements in their original order.
                                $badtag = true;
                                if ( $ot !== null ) {
                                    $tagstack[] = $ot;
                                }
                                while ( count( $optstack ) > 0 ) {
                                    $tagstack[] = array_pop( $optstack );
                                }
                            }
                        } else {
                            # Put the element back; an empty stack stays empty
                            # instead of receiving a null placeholder.
                            if ( $ot !== null ) {
                                $tagstack[] = $ot;
                            }
                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if ( $ot === null || !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
                                $badtag = true;
                            }
                        }
                    }
                    if ( !$badtag && $t === 'table' ) {
                        # The table was closed, either directly or past unclosed
                        # <tr>/<td>/<th>: go back to the stack that was active
                        # outside of it. (Previously this only happened for the
                        # direct match, which left the outer elements orphaned.)
                        $tagstack = array_pop( $tablestack );
                        if ( !is_array( $tagstack ) ) {
                            $tagstack = array();
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if ( isset( $tabletags[$t] ) &&
                    !in_array( 'table', $tagstack ) ) {
                        $badtag = true;
                    } elseif ( in_array( $t, $tagstack ) &&
                    !isset( $htmlnest [$t ] ) ) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ( $brace == '/>' &&
                    isset( $htmlpairs[$t] ) ) {
                        $badtag = true;
                    } elseif ( isset( $htmlsingleonly[$t] ) ) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif ( isset( $htmlsingle[$t] ) ) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                    } elseif ( isset( $tabletags[$t] )
                    && in_array( $t, $tagstack ) ) {
                        // New table tag but forgot to close the previous one
                        $text .= "</$t>";
                    } else {
                        if ( $t == 'table' ) {
                            array_push( $tablestack, $tagstack );
                            $tagstack = array();
                        }
                        array_push( $tagstack, $t );
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if( is_callable( $processCallback ) ) {
                        call_user_func_array( $processCallback, array( &$params, $args ) );
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes( $params, $t );
                }
                if ( !$badtag ) {
                    $rest = str_replace( '>', '&gt;', $rest );
                    $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
                    $text .= "<$slash$t$newparams$close>$rest";
                    continue;
                }
            }
            # Not an allowed tag (or not a tag at all): show it as text
            $text .= '&lt;' . str_replace( '>', '&gt;', $x);
        }
        # Close off any remaining tags
        while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
            $text .= "</$t>\n";
            if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
        }
    } else {
        # this might be possible using tidy itself
        foreach ( $bits as $x ) {
            $regs = array();
            if ( preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs ) ) {
                list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
            } else {
                # No match: explicit defaults instead of a silenced list()
                $slash = $params = $brace = $rest = null;
                $t = '';
            }
            if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                if( is_callable( $processCallback ) ) {
                    call_user_func_array( $processCallback, array( &$params, $args ) );
                }
                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                $rest = str_replace( '>', '&gt;', $rest );
                $text .= "<$slash$t$newparams$brace$rest";
            } else {
                $text .= '&lt;' . str_replace( '>', '&gt;', $x);
            }
        }
    }
    wfProfileOut( __METHOD__ );
    return $text;
}
00569
/**
 * Delete every terminated HTML comment from $text.
 *
 * A comment that sits between two newlines with nothing but spaces around
 * it is removed together with those spaces and one of the newlines, so no
 * blank line is left behind. Any other comment is cut out on its own.
 * Scanning stops at the first "<!--" that has no closing "-->"; that text
 * is left in place.
 *
 * @param $text String
 * @return String
 */
static function removeHTMLcomments( $text ) {
    wfProfileIn( __METHOD__ );

    // Always search from the beginning: the text shrinks on every pass.
    for ( $open = strpos( $text, '<!--' ); $open !== false; $open = strpos( $text, '<!--' ) ) {
        $terminator = strpos( $text, '-->', $open + 4 );
        if ( $terminator === false ) {
            # Unterminated comment; bail out
            break;
        }
        // Offset of the first character after "-->".
        $close = $terminator + 3;

        // Candidate span: the character before the comment, the comment
        // itself, then widened over the spaces on both sides.
        $from = ( $open > 0 ) ? $open - 1 : 0;
        $length = $close - $from;
        for ( ; $from > 0 && substr( $text, $from, 1 ) === ' '; $from-- ) {
            $length++;
        }
        while ( substr( $text, $from + $length, 1 ) === ' ' ) {
            $length++;
        }

        $newlineBefore = substr( $text, $from, 1 ) === "\n";
        $newlineAfter = substr( $text, $from + $length, 1 ) === "\n";

        if ( $newlineBefore && $newlineAfter ) {
            # The comment had a line to itself: swallow the padding and
            # both newlines, and put a single newline back.
            $text = substr_replace( $text, "\n", $from, $length + 1 );
        } else {
            # Remove just the comment.
            $text = substr_replace( $text, '', $open, $close - $open );
        }
    }

    wfProfileOut( __METHOD__ );
    return $text;
}
00614
00628         static function fixDeprecatedAttributes( $attribs, $element ) {
00629                 global $wgHtml5, $wgCleanupPresentationalAttributes;
00630
00631                 // presentational attributes were removed from html5, we can leave them
00632                 // in when html5 is turned off
00633                 if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
00634                         return $attribs;
00635                 }
00636
00637                 $table = array( 'table' );
00638                 $cells = array( 'td', 'th' );
00639                 $colls = array( 'col', 'colgroup' );
00640                 $tblocks = array( 'tbody', 'tfoot', 'thead' );
00641                 $h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
00642
00643                 $presentationalAttribs = array(
00644                         'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
00645                         'clear' => array( 'clear', array( 'br' ) ),
00646                         'height' => array( 'height', $cells ),
00647                         'nowrap' => array( 'white-space', $cells ),
00648                         'size' => array( 'height', array( 'hr' ) ),
00649                         'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
00650                         'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
00651                         'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
00652                 );
00653
00654                 // Ensure that any upper case or mixed case attributes are converted to lowercase
00655                 foreach ( $attribs as $attribute => $value ) {
00656                         if ( $attribute !== strtolower( $attribute ) && array_key_exists( strtolower( $attribute ), $presentationalAttribs ) ) {
00657                                 $attribs[strtolower( $attribute )] = $value;
00658                                 unset( $attribs[$attribute] );
00659                         }
00660                 }
00661
00662                 $style = "";
00663                 foreach ( $presentationalAttribs as $attribute => $info ) {
00664                         list( $property, $elements ) = $info;
00665
00666                         // Skip if this attribute is not relevant to this element
00667                         if ( !in_array( $element, $elements ) ) {
00668                                 continue;
00669                         }
00670
00671                         // Skip if the attribute is not used
00672                         if ( !array_key_exists( $attribute, $attribs ) ) {
00673                                 continue;
00674                         }
00675
00676                         $value = $attribs[$attribute];
00677
00678                         // For nowrap the value should be nowrap instead of whatever text is in the value
00679                         if ( $attribute === 'nowrap' ) {
00680                                 $value = 'nowrap';
00681                         }
00682
00683                         // clear="all" is clear: both; in css
00684                         if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) {
00685                                 $value = 'both';
00686                         }
00687
00688                         // Size based properties should have px applied to them if they have no unit
00689                         if ( in_array( $attribute, array( 'height', 'width', 'size' ) ) ) {
00690                                 if ( preg_match( '/^[\d.]+$/', $value ) ) {
00691                                         $value = "{$value}px";
00692                                 }
00693                         }
00694
00695                         $style .= " $property: $value;";
00696
00697                         unset( $attribs[$attribute] );
00698                 }
00699
00700                 if ( $style ) {
00701                         // Prepend our style rules so that they can be overridden by user css
00702                         if ( isset($attribs['style']) ) {
00703                                 $style .= " " . $attribs['style'];
00704                         }
00705                         $attribs['style'] = trim($style);
00706                 }
00707
00708                 return $attribs;
00709         }
00710
00726         static function validateTagAttributes( $attribs, $element ) {
                      // Filter $attribs through validateAttributes(), using the whitelist
                      // that attributeWhitelist() returns for $element.
                      //
                      // $attribs: array of attribute name => value pairs
                      // $element: element (tag) name used to look up the whitelist
                      // Returns the array produced by validateAttributes().
00727                 return Sanitizer::validateAttributes( $attribs,
00728                         Sanitizer::attributeWhitelist( $element ) );
00729         }
00730
00746         static function validateAttributes( $attribs, $whitelist ) {
                      // Filter an attribute array down to the names in $whitelist and
                      // sanitize the values of the attributes that are kept.
                      //
                      // $attribs:   array of attribute name => value pairs
                      // $whitelist: list of permitted attribute names
                      // Returns a new name => value array; only $out is written to.
                      //
                      // An attribute is dropped when it is not whitelisted (xmlns:* and
                      // data-* have their own rules below), when a link/metadata value
                      // matches EVIL_URI_PATTERN, or when an href/src value does not start
                      // with one of the protocols from wfUrlProtocols().
00747                 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
00748
                      // Flip the list so names become keys and membership is an isset() test
00749                 $whitelist = array_flip( $whitelist );
00750                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
00751
00752                 $out = array();
00753                 foreach( $attribs as $attribute => $value ) {
00754                         #allow XML namespace declaration if RDFa is enabled
00755                         if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
00756                                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00757                                         $out[$attribute] = $value;
00758                                 }
00759
                                      // xmlns:* attributes bypass the whitelist and every later check
00760                                 continue;
00761                         }
00762
00763                         # Allow any attribute beginning with "data-", if in HTML5 mode
00764                         if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
00765                                 continue;
00766                         }
00767
00768                         # Strip javascript "expression" from stylesheets.
00769                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
00770                         if( $attribute == 'style' ) {
00771                                 $value = Sanitizer::checkCss( $value );
00772                         }
00773
00774                         if ( $attribute === 'id' ) {
00775                                 $value = Sanitizer::escapeId( $value, 'noninitial' );
00776                         }
00777
00778                         //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
00779                         if ( $attribute === 'rel' || $attribute === 'rev' ||
00780                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
00781                                 $attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
00782                                 $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
00783                                 $attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
00784
00785                                 //Paranoia. Allow "simple" values but suppress javascript
00786                                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00787                                         continue;
00788                                 }
00789                         }
00790
00791                         # NOTE: even though elements using href/src are not allowed directly, supply
00792                         #       validation code that can be used by tag hook handlers, etc
00793                         if ( $attribute === 'href' || $attribute === 'src' ) {
00794                                 if ( !preg_match( $hrefExp, $value ) ) {
00795                                         continue; //drop any href or src attributes not using an allowed protocol.
00796                                                   //NOTE: this also drops all relative URLs
00797                                 }
00798                         }
00799
00800                         // If this attribute was previously set, override it.
00801                         // Output should only have one attribute of each name.
00802                         $out[$attribute] = $value;
00803                 }
00804
00805                 if ( $wgAllowMicrodataAttributes ) {
00806                         # itemtype, itemid, itemref don't make sense without itemscope
00807                         if ( !array_key_exists( 'itemscope', $out ) ) {
00808                                 unset( $out['itemtype'] );
00809                                 unset( $out['itemid'] );
00810                                 unset( $out['itemref'] );
00811                         }
00812                         # TODO: Strip itemprop if we aren't descendants of an itemscope.
00813                 }
00814                 return $out;
00815         }
00816
00827         static function mergeAttributes( $a, $b ) {
                      // Merge two attribute arrays. Values from $b replace values from $a
                      // for the same string key (array_merge semantics), except for class:
                      // when both arrays carry different string class values, the two class
                      // lists are combined.
                      //
                      // $a, $b: arrays of attribute name => value pairs
                      // Returns the merged array.
00828                 $out = array_merge( $a, $b );
00829                 if( isset( $a['class'] ) && isset( $b['class'] )
00830                 && is_string( $a['class'] ) && is_string( $b['class'] )
00831                 && $a['class'] !== $b['class'] ) {
                              // Split the concatenated lists on whitespace, drop duplicate
                              // class names and rejoin them with single spaces
00832                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
00833                                 -1, PREG_SPLIT_NO_EMPTY );
00834                         $out['class'] = implode( ' ', array_unique( $classes ) );
00835                 }
00836                 return $out;
00837         }
00838
00848         public static function normalizeCss( $value ) {
                      // Bring a CSS string into a canonical form so that checkCss() can
                      // pattern-match it: character references and CSS backslash escapes
                      // are decoded, several look-alike Unicode characters are mapped to
                      // ASCII, and comments are replaced by spaces or cut off.
                      //
                      // $value: CSS text
                      // Returns the normalized CSS text.
00849
00850                 // Decode character references like &#123;
00851                 $value = Sanitizer::decodeCharReferences( $value );
00852
00853                 // Decode escape sequences and line continuation
00854                 // See the grammar in the CSS 2 spec, appendix D.
00855                 // This has to be done AFTER decoding character references.
00856                 // This means it isn't possible for this function to return
00857                 // unsanitized escape sequences. It is possible to manufacture
00858                 // input that contains character references that decode to
00859                 // escape sequences that decode to character references, but
00860                 // it's OK for the return value to contain character references
00861                 // because the caller is supposed to escape those anyway.
                      // The regex is kept in a static local, so it is assembled only on
                      // the first call.
00862                 static $decodeRegex;
00863                 if ( !$decodeRegex ) {
00864                         $space = '[\\x20\\t\\r\\n\\f]';
00865                         $nl = '(?:\\n|\\r\\n|\\r|\\f)';
00866                         $backslash = '\\\\';
00867                         $decodeRegex = "/ $backslash
00868                                 (?:
00869                                         ($nl) |  # 1. Line continuation
00870                                         ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
00871                                         (.) | # 3. backslash cancelling special meaning
00872                                         () | # 4. backslash at end of string
00873                                 )/xu";
00874                 }
00875                 $value = preg_replace_callback( $decodeRegex,
00876                         array( __CLASS__, 'cssDecodeCallback' ), $value );
00877
00878                 // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
00879                 $value = preg_replace_callback(
00880                         '/[！-［］-ｚ]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
00881                         array( __CLASS__, 'cssNormalizeUnicodeWidth' ),
00882                         $value
00883                 );
00884
00885                 // Convert more characters IE6 might treat as ascii
00886                 // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
00887                 $value = str_replace(
00888                         array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
00889                         array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
00890                         $value
00891                 );
00892
00893                 // Remove any comments; IE gets token splitting wrong
00894                 // This must be done AFTER decoding character references and
00895                 // escape sequences, because those steps can introduce comments
00896                 // This step cannot introduce character references or escape
00897                 // sequences, because it replaces comments with spaces rather
00898                 // than removing them completely.
00899                 $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
00900
00901                 // Remove anything after a comment-start token, to guard against
00902                 // incorrect client implementations.
00903                 $commentPos = strpos( $value, '/*' );
00904                 if ( $commentPos !== false ) {
00905                         $value = substr( $value, 0, $commentPos );
00906                 }
00907
00908                 // S followed by repeat, iteration, or prolonged sound marks,
00909                 // which IE will treat as "ss"
00910                 $value = preg_replace(
00911                         '/s(?:
00912                                 \xE3\x80\xB1 | # U+3031
00913                                 \xE3\x82\x9D | # U+309D
00914                                 \xE3\x83\xBC | # U+30FC
00915                                 \xE3\x83\xBD | # U+30FD
00916                                 \xEF\xB9\xBC | # U+FE7C
00917                                 \xEF\xB9\xBD | # U+FE7D
00918                                 \xEF\xBD\xB0   # U+FF70
00919                         )/ix',
00920                         'ss',
00921                         $value
00922                 );
00923
00924                 return $value;
00925         }
00926
00927
00946         static function checkCss( $value ) {
                      // Normalize $value with normalizeCss() and reject it when it contains
                      // control characters or one of the blacklisted constructs below.
                      //
                      // $value: CSS text, e.g. the value of a style attribute
                      // Returns the normalized CSS, or a fixed CSS comment string naming
                      // the reason for the rejection.
00947                 $value = self::normalizeCss( $value );
00948
00949                 // Reject problematic keywords and control characters
00950                 if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
00951                         return '/* invalid control char */';
00952                 } elseif ( preg_match(
00953                         '! expression
00954                                 | filter\s*:
00955                                 | accelerator\s*:
00956                                 | -o-link\s*:
00957                                 | -o-link-source\s*:
00958                                 | -o-replace\s*:
00959                                 | url\s*\(
00960                                 | image\s*\(
00961                                 | image-set\s*\(
00962                         !ix', $value ) ) {                      return '/* insecure input */';
00963                 }
00964                 return $value;
00965         }
00966
00972         static function cssNormalizeUnicodeWidth( $matches ) {
                      // preg_replace_callback handler used by normalizeCss() for the
                      // fullwidth character range: maps the matched character to the
                      // single-byte character 65248 (0xFEE0) code points below it.
                      //
                      // $matches: regex match array; only $matches[0] is read
                      // Returns the replacement character, or an empty string when
                      // utf8ToCodepoint() reports false for the match.
00973                 $cp = utf8ToCodepoint( $matches[0] );
00974                 if ( $cp === false ) {
00975                         return '';
00976                 }
00977                 return chr( $cp - 65248 ); // ASCII range \x21-\x7A
00978         }
00979
00984         static function cssDecodeCallback( $matches ) {
                      // preg_replace_callback handler for the escape-sequence regex built
                      // in normalizeCss(). Capture groups: 1 = line continuation,
                      // 2 = hexadecimal character number, 3 = any other escaped character.
                      //
                      // $matches: regex match array
                      // Returns the text that replaces the whole escape sequence.
00985                 if ( $matches[1] !== '' ) {
00986                         // Line continuation
00987                         return '';
00988                 } elseif ( $matches[2] !== '' ) {
00989                         $char = codepointToUtf8( hexdec( $matches[2] ) );
00990                 } elseif ( $matches[3] !== '' ) {
00991                         $char = $matches[3];
00992                 } else {
                              // Neither group 2 nor 3 captured anything: treat as a lone backslash
00993                         $char = '\\';
00994                 }
00995                 if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
00996                         // These characters need to be escaped in strings
00997                         // Clean up the escape sequence to avoid parsing errors by clients
                              // Emitted as backslash + lowercase hex code + one trailing space
00998                         return '\\' . dechex( ord( $char ) ) . ' ';
00999                 } else {
01000                         // Decode unnecessary escape
01001                         return $char;
01002                 }
01003         }
01004
01024         static function fixTagAttributes( $text, $element ) {
                      // Turn the raw attribute text of a tag into a sanitized attribute
                      // string: the text is decoded, passed through
                      // fixDeprecatedAttributes() and validateTagAttributes(), and each
                      // surviving pair is re-encoded as name="value".
                      //
                      // $text:    raw attribute text
                      // $element: element (tag) name the attributes belong to
                      // Returns the attributes joined by spaces with one leading space,
                      // or an empty string when $text is blank or nothing survives.
01025                 if( trim( $text ) == '' ) {
01026                         return '';
01027                 }
01028
01029                 $decoded = Sanitizer::decodeTagAttributes( $text );
01030                 $decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
01031                 $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
01032
01033                 $attribs = array();
01034                 foreach( $stripped as $attribute => $value ) {
01035                         $encAttribute = htmlspecialchars( $attribute );
01036                         $encValue = Sanitizer::safeEncodeAttribute( $value );
01037
01038                         $attribs[] = "$encAttribute=\"$encValue\"";
01039                 }
01040                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
01041         }
01042
01048         static function encodeAttribute( $text ) {
                      // Encode $text for use as an attribute value: htmlspecialchars() with
                      // ENT_QUOTES, then newline, carriage return and tab are turned into
                      // numeric character references.
                      //
                      // $text: attribute value to encode
                      // Returns the encoded string.
01049                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
01050
01051                 // Whitespace is normalized during attribute decoding,
01052                 // so if we've been passed non-spaces we must encode them
01053                 // ahead of time or they won't be preserved.
01054                 $encValue = strtr( $encValue, array(
01055                         "\n" => '&#10;',
01056                         "\r" => '&#13;',
01057                         "\t" => '&#9;',
01058                 ) );
01059
01060                 return $encValue;
01061         }
01062
01069         static function safeEncodeAttribute( $text ) {
                      // Like encodeAttribute(), but additionally replaces characters and
                      // keywords from the table below with character references, and
                      // encodes the colon inside every substring that matches one of the
                      // wfUrlProtocols() prefixes.
                      //
                      // $text: attribute value to encode
                      // Returns the encoded string.
01070                 $encValue = Sanitizer::encodeAttribute( $text );
01071
01072                 # Templates and links may be expanded in later parsing,
01073                 # creating invalid or dangerous output. Suppress this.
01074                 $encValue = strtr( $encValue, array(
01075                         '<'    => '&lt;',   // This should never happen,
01076                         '>'    => '&gt;',   // we've received invalid input
01077                         '"'    => '&quot;', // which should have been escaped.
01078                         '{'    => '&#123;',
01079                         '['    => '&#91;',
01080                         "''"   => '&#39;&#39;',
01081                         'ISBN' => '&#73;SBN',
01082                         'RFC'  => '&#82;FC',
01083                         'PMID' => '&#80;MID',
01084                         '|'    => '&#124;',
01085                         '__'   => '&#95;_',
01086                 ) );
01087
01088                 # Stupid hack
                      // armorLinksCallback() rewrites the colon of each matched protocol
                      // prefix as a numeric character reference
01089                 $encValue = preg_replace_callback(
01090                         '/(' . wfUrlProtocols() . ')/',
01091                         array( 'Sanitizer', 'armorLinksCallback' ),
01092                         $encValue );
01093                 return $encValue;
01094         }
01095
01127         static function escapeId( $id, $options = array() ) {
                      // Escape a string for use as an id value.
                      //
                      // $id:      string to escape
                      // $options: a string or array of strings; recognized values:
                      //           legacy     - always use the HTML4-style escaping, even
                      //                        when the HTML5 id mode is enabled
                      //           noninitial - do not force the result to start with a
                      //                        letter (HTML4-style escaping only)
                      // Returns the escaped id.
01128                 global $wgHtml5, $wgExperimentalHtmlIds;
01129                 $options = (array)$options;
01130
01131                 if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
                              // HTML5 mode: decode character references, collapse each run of
                              // the listed characters into one underscore and trim underscores
                              // from both ends
01132                         $id = Sanitizer::decodeCharReferences( $id );
01133                         $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
01134                         $id = trim( $id, '_' );
01135                         if ( $id === '' ) {
01136                                 # Must have been all whitespace to start with.
01137                                 return '_';
01138                         } else {
01139                                 return $id;
01140                         }
01141                 }
01142
01143                 # HTML4-style escaping
                      // Applied in array order after urlencode(): an encoded colon is
                      // restored first, then every remaining percent sign becomes a dot
01144                 static $replace = array(
01145                         '%3A' => ':',
01146                         '%' => '.'
01147                 );
01148
01149                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
01150                 $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
01151
01152                 if ( !preg_match( '/^[a-zA-Z]/', $id )
01153                 && !in_array( 'noninitial', $options ) )  {
01154                         // Initial character must be a letter!
01155                         $id = "x$id";
01156                 }
01157                 return $id;
01158         }
01159
01171         static function escapeClass( $class ) {
                      // Escape a string for use as a class name.
                      //
                      // The first pattern replaces with an underscore: a leading digit or
                      // hyphen, every byte from 0x00 to 0x20, each listed ASCII punctuation
                      // character, and the byte pair C2 A0. The second pattern then
                      // collapses runs of underscores, and trailing underscores are
                      // trimmed from the result.
                      //
                      // $class: string to escape
                      // Returns the escaped class name.
01172                 // Convert ugly stuff to underscores and kill underscores in ugly places
01173                 return rtrim(preg_replace(
01174                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
01175                         '_',
01176                         $class ), '_');
01177         }
01178
01186         static function escapeHtmlAllowEntities( $html ) {
                      // Escape text for HTML output while accepting character references
                      // in the input: references are decoded first, then the whole string
                      // is escaped with htmlspecialchars() and ENT_QUOTES.
                      //
                      // $html: text that may contain character references
                      // Returns the escaped string.
01187                 $html = Sanitizer::decodeCharReferences( $html );
01188                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
01189                 # hurt.
01190                 $html = htmlspecialchars( $html, ENT_QUOTES );
01191                 return $html;
01192         }
01193
01199         private static function armorLinksCallback( $matches ) {
                      // preg_replace_callback handler used by safeEncodeAttribute().
                      // Returns capture group 1 with every colon replaced by the numeric
                      // character reference &#58;.
01200                 return str_replace( ':', '&#58;', $matches[1] );
01201         }
01202
01211         public static function decodeTagAttributes( $text ) {
                      // Parse the raw attribute text of a tag into an array.
                      //
                      // Attribute names are lowercased. In each value, runs of tab, CR, LF
                      // and space are collapsed into one space, the ends are trimmed and
                      // character references are decoded. When a name occurs more than
                      // once, the last occurrence wins.
                      //
                      // $text: raw attribute text
                      // Returns an array of name => value pairs; empty when $text is blank
                      // or the attribute regex matches nothing.
01212                 if( trim( $text ) == '' ) {
01213                         return array();
01214                 }
01215
01216                 $attribs = array();
01217                 $pairs = array();
01218                 if( !preg_match_all(
01219                         self::getAttribsRegex(),
01220                         $text,
01221                         $pairs,
01222                         PREG_SET_ORDER ) ) {
01223                         return $attribs;
01224                 }
01225
01226                 foreach( $pairs as $set ) {
01227                         $attribute = strtolower( $set[1] );
01228                         $value = Sanitizer::getTagAttributeCallback( $set );
01229
01230                         // Normalize whitespace
01231                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
01232                         $value = trim( $value );
01233
01234                         // Decode character references
01235                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
01236                 }
01237                 return $attribs;
01238         }
01239
01247         private static function getTagAttributeCallback( $set ) {
                      // Pick the attribute value out of one match set produced by the
                      // attribute regex in decodeTagAttributes(). Groups are tested from
                      // the highest index down, so the first one that is set wins.
                      //
                      // $set: one PREG_SET_ORDER match; $set[1] is the attribute name
                      // Returns the raw attribute value, or the attribute name itself when
                      // no value group (2 to 6) is set.
                      // Throws MWException when group 2 is set but groups 3 to 6 are not.
01248                 if( isset( $set[6] ) ) {
01249                         # Illegal #XXXXXX color with no quotes.
01250                         return $set[6];
01251                 } elseif( isset( $set[5] ) ) {
01252                         # No quotes.
01253                         return $set[5];
01254                 } elseif( isset( $set[4] ) ) {
01255                         # Single-quoted
01256                         return $set[4];
01257                 } elseif( isset( $set[3] ) ) {
01258                         # Double-quoted
01259                         return $set[3];
01260                 } elseif( !isset( $set[2] ) ) {
01261                         # In XHTML, attributes must have a value.
01262                         # For 'reduced' form, return explicitly the attribute name here.
01263                         return $set[1];
01264                 } else {
01265                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
01266                 }
01267         }
01268
01280         private static function normalizeAttributeValue( $text ) {
                      // Normalize an attribute value in three steps, innermost call first:
                      // normalizeCharReferences(), then normalizeWhitespace(), then every
                      // double quote is replaced by &quot;.
                      //
                      // $text: attribute value
                      // Returns the normalized value.
01281                 return str_replace( '"', '&quot;',
01282                         self::normalizeWhitespace(
01283                                 Sanitizer::normalizeCharReferences( $text ) ) );
01284         }
01285
01290         private static function normalizeWhitespace( $text ) {
                      // Replace each CR+LF pair and each single space, CR, LF or tab with
                      // one space. Runs are not collapsed: the pattern has no repetition,
                      // so two consecutive tabs become two spaces.
                      //
                      // $text: input string
                      // Returns the string with whitespace replaced.
01291                 return preg_replace(
01292                         '/\r\n|[\x20\x0d\x0a\x09]/',
01293                         ' ',
01294                         $text );
01295         }
01296
01305         static function normalizeSectionNameWhitespace( $section ) {
                      // Collapse every run of spaces and underscores in $section into one
                      // space and trim the result.
                      //
                      // $section: section name
                      // Returns the normalized section name.
01306                 return trim( preg_replace( '/[ _]+/', ' ', $section ) );
01307         }
01308
01324         static function normalizeCharReferences( $text ) {
                      // Run normalizeCharReferencesCallback() over everything in $text that
                      // CHAR_REFS_REGEX matches: named, decimal and hexadecimal character
                      // references as well as bare ampersands.
                      //
                      // $text: input text
                      // Returns the text with each match replaced by the callback result.
01325                 return preg_replace_callback(
01326                         self::CHAR_REFS_REGEX,
01327                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
01328                         $text );
01329         }
01334         static function normalizeCharReferencesCallback( $matches ) {
                      // preg_replace_callback handler for normalizeCharReferences().
                      // CHAR_REFS_REGEX groups: 1 = entity name, 2 = decimal number,
                      // 3 = hexadecimal number, 4 = bare ampersand.
                      //
                      // $matches: regex match array
                      // Returns the normalized reference. When no helper produced a value
                      // (bare ampersand, or a numeric reference rejected by the helper),
                      // the matched text is returned escaped with htmlspecialchars().
01335                 $ret = null;
01336                 if( $matches[1] != '' ) {
01337                         $ret = Sanitizer::normalizeEntity( $matches[1] );
01338                 } elseif( $matches[2] != '' ) {
01339                         $ret = Sanitizer::decCharReference( $matches[2] );
01340                 } elseif( $matches[3] != ''  ) {
01341                         $ret = Sanitizer::hexCharReference( $matches[3] );
01342                 }
01343                 if( is_null( $ret ) ) {
01344                         return htmlspecialchars( $matches[0] );
01345                 } else {
01346                         return $ret;
01347                 }
01348         }
01349
01360         static function normalizeEntity( $name ) {
                      // Normalize a named entity reference.
                      //
                      // $name: entity name without the leading ampersand and the semicolon
                      // Returns, in order of precedence:
                      //  - the target of the alias as a named reference, when $name is a key
                      //    of self::$htmlEntityAliases (table declared outside this chunk)
                      //  - the reference unchanged for lt, gt, amp and quot
                      //  - a decimal numeric reference for other keys of self::$htmlEntities
                      //  - otherwise the original text with its ampersand escaped as &amp;
01361                 if ( isset( self::$htmlEntityAliases[$name] ) ) {
01362                         return '&' . self::$htmlEntityAliases[$name] . ';';
01363                 } elseif ( in_array( $name,
01364                 array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
01365                         return "&$name;";
01366                 } elseif ( isset( self::$htmlEntities[$name] ) ) {
01367                         return '&#' . self::$htmlEntities[$name] . ';';
01368                 } else {
01369                         return "&amp;$name;";
01370                 }
01371         }
01372
01377         static function decCharReference( $codepoint ) {
                      // Normalize a decimal character reference.
                      //
                      // $codepoint: the digits of the reference as a string
                      // Returns the reference re-printed in decimal form (intval() drops
                      // leading zeros), or null when validateCodepoint() rejects the
                      // code point.
01378                 $point = intval( $codepoint );
01379                 if( Sanitizer::validateCodepoint( $point ) ) {
01380                         return sprintf( '&#%d;', $point );
01381                 } else {
01382                         return null;
01383                 }
01384         }
01385
01390         static function hexCharReference( $codepoint ) {
                      // Normalize a hexadecimal character reference.
                      //
                      // $codepoint: the hexadecimal digits of the reference as a string
                      // Returns the reference re-printed with a lowercase x prefix and
                      // lowercase hex digits, or null when validateCodepoint() rejects the
                      // code point.
01391                 $point = hexdec( $codepoint );
01392                 if( Sanitizer::validateCodepoint( $point ) ) {
01393                         return sprintf( '&#x%x;', $point );
01394                 } else {
01395                         return null;
01396                 }
01397         }
01398
01404         private static function validateCodepoint( $codepoint ) {
                      // Report whether $codepoint is accepted as a character: tab, LF, CR,
                      // or a value in one of the ranges 0x20-0xD7FF, 0xE000-0xFFFD,
                      // 0x10000-0x10FFFF. This leaves out the other control characters
                      // below 0x20, the range 0xD800-0xDFFF, and 0xFFFE/0xFFFF.
                      //
                      // $codepoint: numeric code point
                      // Returns true when the code point is accepted, false otherwise.
01405                 return ($codepoint ==    0x09)
01406                         || ($codepoint ==    0x0a)
01407                         || ($codepoint ==    0x0d)
01408                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
01409                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
01410                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
01411         }
01412
01420         public static function decodeCharReferences( $text ) {
                      // Decode the character references in $text by running
                      // decodeCharReferencesCallback() over every CHAR_REFS_REGEX match.
                      //
                      // $text: input text
                      // Returns the text with each match replaced by the callback result.
01421                 return preg_replace_callback(
01422                         self::CHAR_REFS_REGEX,
01423                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01424                         $text );
01425         }
01426
01437         public static function decodeCharReferencesAndNormalize( $text ) {
                      // Same decoding as decodeCharReferences(), followed by
                      // $wgContLang->normalize() on the result. The normalize() call is
                      // skipped when the regex matched nothing, i.e. $count is zero.
                      //
                      // $text: input text
                      // Returns the decoded, and if anything matched normalized, text.
01438                 global $wgContLang;
01439                 $text = preg_replace_callback(
01440                         self::CHAR_REFS_REGEX,
01441                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01442                         $text, /* limit */ -1, $count );
01443
01444                 if ( $count ) {
01445                         return $wgContLang->normalize( $text );
01446                 } else {
01447                         return $text;
01448                 }
01449         }
01450
01455         static function decodeCharReferencesCallback( $matches ) {
                      // preg_replace_callback handler for the decodeCharReferences*()
                      // methods. CHAR_REFS_REGEX groups: 1 = entity name, 2 = decimal
                      // number, 3 = hexadecimal number.
                      //
                      // $matches: regex match array
                      // Returns the result of decodeEntity() or decodeChar() for the
                      // reference, or the matched text unchanged for a bare ampersand.
01456                 if( $matches[1] != '' ) {
01457                         return Sanitizer::decodeEntity( $matches[1] );
01458                 } elseif( $matches[2] != '' ) {
01459                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
01460                 } elseif( $matches[3] != ''  ) {
01461                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
01462                 }
01463                 # Last case should be an ampersand by itself
01464                 return $matches[0];
01465         }
01466
01474         static function decodeChar( $codepoint ) {
                      // Convert a numeric code point into a character.
                      //
                      // $codepoint: numeric code point
                      // Returns the result of codepointToUtf8() when validateCodepoint()
                      // accepts the code point, otherwise the UTF8_REPLACEMENT constant
                      // (defined outside this chunk).
01475                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
01476                         return codepointToUtf8( $codepoint );
01477                 } else {
01478                         return UTF8_REPLACEMENT;
01479                 }
01480         }
01481
01490         static function decodeEntity( $name ) {
                      // Decode a named entity.
                      //
                      // $name: entity name without the leading ampersand and the semicolon
                      // Returns the result of codepointToUtf8() for the code point listed
                      // in self::$htmlEntities, or the reference text rebuilt from $name
                      // when the name is unknown.
                      //
                      // An alias from self::$htmlEntityAliases is resolved first, so for an
                      // unknown alias target the rebuilt text carries the resolved name.
01491                 if ( isset( self::$htmlEntityAliases[$name] ) ) {
01492                         $name = self::$htmlEntityAliases[$name];
01493                 }
01494                 if( isset( self::$htmlEntities[$name] ) ) {
01495                         return codepointToUtf8( self::$htmlEntities[$name] );
01496                 } else {
01497                         return "&$name;";
01498                 }
01499         }
01500
01507         static function attributeWhitelist( $element ) {
                      // Look up the list of allowed attributes for an element.
                      //
                      // $element: element (tag) name
                      // Returns the whitelist entry for $element, or an empty array when
                      // the element has no entry.
                      //
                      // The table from setupAttributeWhitelist() is kept in a static local,
                      // so it is built on the first call only.
01508                 static $list;
01509                 if( !isset( $list ) ) {
01510                         $list = Sanitizer::setupAttributeWhitelist();
01511                 }
01512                 return isset( $list[$element] )
01513                         ? $list[$element]
01514                         : array();
01515         }
01516
01522         static function setupAttributeWhitelist() {
01523                 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
01524
01525                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
01526
01527                 if ( $wgAllowRdfaAttributes ) {
01528                         #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
01529                         $common = array_merge( $common, array(
01530                             'about', 'property', 'resource', 'datatype', 'typeof',
01531                         ) );
01532                 }
01533
01534                 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
01535                         # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
01536                         $common = array_merge( $common, array(
01537                             'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
01538                         ) );
01539                 }
01540
01541                 $block = array_merge( $common, array( 'align' ) );
01542                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
01543                 $tablecell = array( 'abbr',
01544                                     'axis',
01545                                     'headers',
01546                                     'scope',
01547                                     'rowspan',
01548                                     'colspan',
01549                                     'nowrap', # deprecated
01550                                     'width',  # deprecated
01551                                     'height', # deprecated
01552                                     'bgcolor' # deprecated
01553                                     );
01554
01555                 # Numbers refer to sections in HTML 4.01 standard describing the element.
01556                 # See: http://www.w3.org/TR/html4/
01557                 $whitelist = array (
01558                         # 7.5.4
01559                         'div'        => $block,
01560                         'center'     => $common, # deprecated
01561                         'span'       => $block, # ??
01562
01563                         # 7.5.5
01564                         'h1'         => $block,
01565                         'h2'         => $block,
01566                         'h3'         => $block,
01567                         'h4'         => $block,
01568                         'h5'         => $block,
01569                         'h6'         => $block,
01570
01571                         # 7.5.6
01572                         # address
01573
01574                         # 8.2.4
01575                         # bdo
01576
01577                         # 9.2.1
01578                         'em'         => $common,
01579                         'strong'     => $common,
01580                         'cite'       => $common,
01581                         'dfn'        => $common,
01582                         'code'       => $common,
01583                         'samp'       => $common,
01584                         'kbd'        => $common,
01585                         'var'        => $common,
01586                         'abbr'       => $common,
01587                         # acronym
01588
01589                         # 9.2.2
01590                         'blockquote' => array_merge( $common, array( 'cite' ) ),
01591                         # q
01592
01593                         # 9.2.3
01594                         'sub'        => $common,
01595                         'sup'        => $common,
01596
01597                         # 9.3.1
01598                         'p'          => $block,
01599
01600                         # 9.3.2
01601                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
01602
01603                         # 9.3.4
01604                         'pre'        => array_merge( $common, array( 'width' ) ),
01605
01606                         # 9.4
01607                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01608                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01609
01610                         # 10.2
01611                         'ul'         => array_merge( $common, array( 'type' ) ),
01612                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
01613                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
01614
01615                         # 10.3
01616                         'dl'         => $common,
01617                         'dd'         => $common,
01618                         'dt'         => $common,
01619
01620                         # 11.2.1
01621                         'table'      => array_merge( $common,
01622                                                                 array( 'summary', 'width', 'border', 'frame',
01623                                                                                 'rules', 'cellspacing', 'cellpadding',
01624                                                                                 'align', 'bgcolor',
01625                                                                 ) ),
01626
01627                         # 11.2.2
01628                         'caption'    => array_merge( $common, array( 'align' ) ),
01629
01630                         # 11.2.3
01631                         'thead'      => array_merge( $common, $tablealign ),
01632                         'tfoot'      => array_merge( $common, $tablealign ),
01633                         'tbody'      => array_merge( $common, $tablealign ),
01634
01635                         # 11.2.4
01636                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01637                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01638
01639                         # 11.2.5
01640                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
01641
01642                         # 11.2.6
01643                         'td'         => array_merge( $common, $tablecell, $tablealign ),
01644                         'th'         => array_merge( $common, $tablecell, $tablealign ),
01645
01646                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
01647                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
01648
01649                         # 13.2
01650                         # Not usually allowed, but may be used for extension-style hooks
01651                         # such as <math> when it is rasterized, or if $wgAllowImageTag is
01652                         # true
01653                         'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
01654
01655                         # 15.2.1
01656                         'tt'         => $common,
01657                         'b'          => $common,
01658                         'i'          => $common,
01659                         'big'        => $common,
01660                         'small'      => $common,
01661                         'strike'     => $common,
01662                         's'          => $common,
01663                         'u'          => $common,
01664
01665                         # 15.2.2
01666                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
01667                         # basefont
01668
01669                         # 15.3
01670                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
01671
01672                         # XHTML Ruby annotation text module, simple ruby only.
01673                         # http://www.w3c.org/TR/ruby/
01674                         'ruby'       => $common,
01675                         # rbc
01676                         # rtc
01677                         'rb'         => $common,
01678                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
01679                         'rp'         => $common,
01680
01681                         # MathML root element, where used for extensions
01682                         # 'title' may not be 100% valid here; it's XHTML
01683                         # http://www.w3.org/TR/REC-MathML/
01684                         'math'       => array( 'class', 'style', 'id', 'title' ),
01685                         );
01686                 return $whitelist;
01687         }
01688
/**
 * Reduce a fragment of markup to plain text.
 *
 * Everything from a '<' to its closing '>' is removed with
 * StringUtils::delimiterReplace(); the remainder then has its character
 * references decoded and its whitespace normalized.
 *
 * Because references are decoded after the tags are removed, the return
 * value is plain text: escape it again before placing it in HTML output.
 *
 * @param string $text Markup fragment to strip
 * @return string The text with tags removed
 */
static function stripAllTags( $text ) {
        # Remove the <tags> themselves before touching any entities.
        $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

        # Decode &entities; first, then normalize whitespace on the result.
        return self::normalizeWhitespace(
                self::decodeCharReferences( $withoutTags )
        );
}
01709
/**
 * Build a DOCTYPE declaration whose internal subset declares every entry
 * of self::$htmlEntities.
 *
 * Each entity name is declared as an entity that expands to the numeric
 * character reference of its code point. The declarations are emitted
 * back to back, with no separator between them.
 *
 * @return string The complete "<!DOCTYPE html [ ... ]>" block, newline-terminated
 */
static function hackDocType() {
        # Collect one <!ENTITY ...> declaration per known entity name.
        $declarations = array();
        foreach ( self::$htmlEntities as $entity => $codepoint ) {
                $declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
        }

        # Wrap the concatenated declarations in the DOCTYPE internal subset.
        return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01727
/**
 * Normalize a URL before it is used as a link target.
 *
 * Steps, in order:
 *  - decode HTML character references in the input;
 *  - percent-encode brackets, angle brackets, double quotes, pipes and
 *    control characters/space (0x00-0x20, 0x7F);
 *  - if the URL has the shape "scheme:[//authority]rest", strip characters
 *    from the authority that are ignored in internationalized domain names,
 *    so that later blacklist checks see the host a browser would see;
 *  - restore the brackets of an IPv6 literal host, which the escaping step
 *    above would otherwise leave as "%5B...%5D".
 *
 * Hostnames are not otherwise validated here.
 *
 * @param string $url URL to clean; may contain HTML character references
 * @return string The cleaned URL
 */
static function cleanUrl( $url ) {
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().
        $url = Sanitizer::decodeCharReferences( $url );

        # Escape any control characters introduced by the above step
        $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
                array( __CLASS__, 'cleanUrlCallback' ), $url );

        # Split into protocol, optional //authority and the rest.
        # Anything that does not look like "scheme:..." is returned as is.
        $matches = array();
        if( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
                return $url;
        }
        list( /* $whole */, $protocol, $host, $rest ) = $matches;

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        $strip = "/
                \\s|          # general whitespace
                \xc2\xad|     # 00ad SOFT HYPHEN
                \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                \xe2\x81\xa0| # 2060 WORD JOINER
                \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
                /xuD";

        # With the /u modifier preg_replace() returns null when the subject
        # is not valid UTF-8. Keep the unstripped host in that case rather
        # than silently dropping the whole authority from the URL.
        $stripped = preg_replace( $strip, '', $host );
        if ( $stripped !== null ) {
                $host = $stripped;
        }

        # IPv6 literals are written as //[address](:port). The escaping step
        # above turned the brackets into %5B / %5D, which is no longer a
        # valid host; put the brackets back for a well-formed literal.
        $ipv6 = array();
        if ( preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!D', $host, $ipv6 ) ) {
                $host = '//[' . $ipv6[1] . ']' . $ipv6[2];
        }

        // @todo FIXME: Validate hostnames here

        return $protocol . $host . $rest;
}
01774
/**
 * preg_replace_callback() handler used by cleanUrl().
 *
 * @param array $matches Match array; element 0 is the full matched text
 * @return string The matched text passed through urlencode()
 */
static function cleanUrlCallback( $matches ) {
        $matched = $matches[0];
        return urlencode( $matched );
}
01782
/**
 * Check whether a string looks like a valid e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first. If a handler aborts the hook,
 * whatever it stored in $result is returned and the built-in check is
 * skipped. Otherwise the address is matched against a liberal pattern:
 * one or more atext characters or dots, an at sign, then one or more
 * dot-separated labels made of letters, digits and hyphens.
 *
 * The whole string must match: the D modifier keeps the final "$" from
 * also matching just before a trailing newline, so "user@example.com\n"
 * is rejected.
 *
 * @param string $addr Address to check
 * @return bool|mixed True or false from the built-in check, or the
 *   hook-supplied $result (null if the handler did not set it)
 */
public static function validateEmail( $addr ) {
        $result = null;
        if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
                return $result;
        }

        // Both strings below are interpolated inside a character class
        // [...], where an unescaped hyphen "-" would denote a range. The
        // hyphen is therefore backslash-escaped, and the backslash doubled
        // for the PHP double-quoted string. See bug 26948.
        $rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
        $rfc1034_ldh_str = "a-z0-9\\-" ;

        $HTML5_email_regexp = "/
        ^                      # start of string
        [$rfc5322_atext\\.]+    # user part which is liberal :p
        @                      # at sign
        [$rfc1034_ldh_str]+       # First domain part
        (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
        $                      # End of string
        /ixD" ; // case Insensitive, eXtended, Dollar matches only at the very end

        return (bool) preg_match( $HTML5_email_regexp, $addr );
}
01834 }