php/html/Sanitizer_8php_source.html

00001 <?php
00031 class Sanitizer {
00036         const CHAR_REFS_REGEX =
                      // Extended-mode (/x) pattern that matches one character reference or a lone
                      // ampersand. Exactly one capture group is filled per match:
                      //   1 - named reference,   &name;   (ASCII alphanumerics or bytes 0x80-0xff)
                      //   2 - decimal reference, &#123;
                      //   3 - hex reference,     &#x1F; or &#X1F;
                      //   4 - a bare '&' that did not start any of the forms above
                      // '#' is written as '\#' because an unescaped '#' starts a comment in /x mode.
00037                 '/&([A-Za-z0-9\x80-\xff]+);
00038                  |&\#([0-9]+);
00039                  |&\#[xX]([0-9A-Fa-f]+);
00040                  |(&)/x';
00041
00050         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; // Case-insensitive: "javascript"/"vbscript" at the start, after whitespace or after a "*/", followed by a non-word character or the end
00051         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; // Attribute names consisting of "xmlns:" plus one or more name characters
00052
00059         static $htmlEntities = array(
                      // Named HTML character entities mapped to their decimal code points.
                      // Keys are case-sensitive: 'Aacute' (193) and 'aacute' (225) are different entries.
00060                 'Aacute'   => 193,
00061                 'aacute'   => 225,
00062                 'Acirc'    => 194,
00063                 'acirc'    => 226,
00064                 'acute'    => 180,
00065                 'AElig'    => 198,
00066                 'aelig'    => 230,
00067                 'Agrave'   => 192,
00068                 'agrave'   => 224,
00069                 'alefsym'  => 8501,
00070                 'Alpha'    => 913,
00071                 'alpha'    => 945,
00072                 'amp'      => 38,
00073                 'and'      => 8743,
00074                 'ang'      => 8736,
00075                 'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00076                 'Aring'    => 197,
00077                 'aring'    => 229,
00078                 'asymp'    => 8776,
00079                 'Atilde'   => 195,
00080                 'atilde'   => 227,
00081                 'Auml'     => 196,
00082                 'auml'     => 228,
00083                 'bdquo'    => 8222,
00084                 'Beta'     => 914,
00085                 'beta'     => 946,
00086                 'brvbar'   => 166,
00087                 'bull'     => 8226,
00088                 'cap'      => 8745,
00089                 'Ccedil'   => 199,
00090                 'ccedil'   => 231,
00091                 'cedil'    => 184,
00092                 'cent'     => 162,
00093                 'Chi'      => 935,
00094                 'chi'      => 967,
00095                 'circ'     => 710,
00096                 'clubs'    => 9827,
00097                 'cong'     => 8773,
00098                 'copy'     => 169,
00099                 'crarr'    => 8629,
00100                 'cup'      => 8746,
00101                 'curren'   => 164,
00102                 'dagger'   => 8224,
00103                 'Dagger'   => 8225,
00104                 'darr'     => 8595,
00105                 'dArr'     => 8659,
00106                 'deg'      => 176,
00107                 'Delta'    => 916,
00108                 'delta'    => 948,
00109                 'diams'    => 9830,
00110                 'divide'   => 247,
00111                 'Eacute'   => 201,
00112                 'eacute'   => 233,
00113                 'Ecirc'    => 202,
00114                 'ecirc'    => 234,
00115                 'Egrave'   => 200,
00116                 'egrave'   => 232,
00117                 'empty'    => 8709,
00118                 'emsp'     => 8195,
00119                 'ensp'     => 8194,
00120                 'Epsilon'  => 917,
00121                 'epsilon'  => 949,
00122                 'equiv'    => 8801,
00123                 'Eta'      => 919,
00124                 'eta'      => 951,
00125                 'ETH'      => 208,
00126                 'eth'      => 240,
00127                 'Euml'     => 203,
00128                 'euml'     => 235,
00129                 'euro'     => 8364,
00130                 'exist'    => 8707,
00131                 'fnof'     => 402,
00132                 'forall'   => 8704,
00133                 'frac12'   => 189,
00134                 'frac14'   => 188,
00135                 'frac34'   => 190,
00136                 'frasl'    => 8260,
00137                 'Gamma'    => 915,
00138                 'gamma'    => 947,
00139                 'ge'       => 8805,
00140                 'gt'       => 62,
00141                 'harr'     => 8596,
00142                 'hArr'     => 8660,
00143                 'hearts'   => 9829,
00144                 'hellip'   => 8230,
00145                 'Iacute'   => 205,
00146                 'iacute'   => 237,
00147                 'Icirc'    => 206,
00148                 'icirc'    => 238,
00149                 'iexcl'    => 161,
00150                 'Igrave'   => 204,
00151                 'igrave'   => 236,
00152                 'image'    => 8465,
00153                 'infin'    => 8734,
00154                 'int'      => 8747,
00155                 'Iota'     => 921,
00156                 'iota'     => 953,
00157                 'iquest'   => 191,
00158                 'isin'     => 8712,
00159                 'Iuml'     => 207,
00160                 'iuml'     => 239,
00161                 'Kappa'    => 922,
00162                 'kappa'    => 954,
00163                 'Lambda'   => 923,
00164                 'lambda'   => 955,
00165                 'lang'     => 9001,
00166                 'laquo'    => 171,
00167                 'larr'     => 8592,
00168                 'lArr'     => 8656,
00169                 'lceil'    => 8968,
00170                 'ldquo'    => 8220,
00171                 'le'       => 8804,
00172                 'lfloor'   => 8970,
00173                 'lowast'   => 8727,
00174                 'loz'      => 9674,
00175                 'lrm'      => 8206,
00176                 'lsaquo'   => 8249,
00177                 'lsquo'    => 8216,
00178                 'lt'       => 60,
00179                 'macr'     => 175,
00180                 'mdash'    => 8212,
00181                 'micro'    => 181,
00182                 'middot'   => 183,
00183                 'minus'    => 8722,
00184                 'Mu'       => 924,
00185                 'mu'       => 956,
00186                 'nabla'    => 8711,
00187                 'nbsp'     => 160,
00188                 'ndash'    => 8211,
00189                 'ne'       => 8800,
00190                 'ni'       => 8715,
00191                 'not'      => 172,
00192                 'notin'    => 8713,
00193                 'nsub'     => 8836,
00194                 'Ntilde'   => 209,
00195                 'ntilde'   => 241,
00196                 'Nu'       => 925,
00197                 'nu'       => 957,
00198                 'Oacute'   => 211,
00199                 'oacute'   => 243,
00200                 'Ocirc'    => 212,
00201                 'ocirc'    => 244,
00202                 'OElig'    => 338,
00203                 'oelig'    => 339,
00204                 'Ograve'   => 210,
00205                 'ograve'   => 242,
00206                 'oline'    => 8254,
00207                 'Omega'    => 937,
00208                 'omega'    => 969,
00209                 'Omicron'  => 927,
00210                 'omicron'  => 959,
00211                 'oplus'    => 8853,
00212                 'or'       => 8744,
00213                 'ordf'     => 170,
00214                 'ordm'     => 186,
00215                 'Oslash'   => 216,
00216                 'oslash'   => 248,
00217                 'Otilde'   => 213,
00218                 'otilde'   => 245,
00219                 'otimes'   => 8855,
00220                 'Ouml'     => 214,
00221                 'ouml'     => 246,
00222                 'para'     => 182,
00223                 'part'     => 8706,
00224                 'permil'   => 8240,
00225                 'perp'     => 8869,
00226                 'Phi'      => 934,
00227                 'phi'      => 966,
00228                 'Pi'       => 928,
00229                 'pi'       => 960,
00230                 'piv'      => 982,
00231                 'plusmn'   => 177,
00232                 'pound'    => 163,
00233                 'prime'    => 8242,
00234                 'Prime'    => 8243,
00235                 'prod'     => 8719,
00236                 'prop'     => 8733,
00237                 'Psi'      => 936,
00238                 'psi'      => 968,
00239                 'quot'     => 34,
00240                 'radic'    => 8730,
00241                 'rang'     => 9002,
00242                 'raquo'    => 187,
00243                 'rarr'     => 8594,
00244                 'rArr'     => 8658,
00245                 'rceil'    => 8969,
00246                 'rdquo'    => 8221,
00247                 'real'     => 8476,
00248                 'reg'      => 174,
00249                 'rfloor'   => 8971,
00250                 'Rho'      => 929,
00251                 'rho'      => 961,
00252                 'rlm'      => 8207,
00253                 'rsaquo'   => 8250,
00254                 'rsquo'    => 8217,
00255                 'sbquo'    => 8218,
00256                 'Scaron'   => 352,
00257                 'scaron'   => 353,
00258                 'sdot'     => 8901,
00259                 'sect'     => 167,
00260                 'shy'      => 173,
00261                 'Sigma'    => 931,
00262                 'sigma'    => 963,
00263                 'sigmaf'   => 962,
00264                 'sim'      => 8764,
00265                 'spades'   => 9824,
00266                 'sub'      => 8834,
00267                 'sube'     => 8838,
00268                 'sum'      => 8721,
00269                 'sup'      => 8835,
00270                 'sup1'     => 185,
00271                 'sup2'     => 178,
00272                 'sup3'     => 179,
00273                 'supe'     => 8839,
00274                 'szlig'    => 223,
00275                 'Tau'      => 932,
00276                 'tau'      => 964,
00277                 'there4'   => 8756,
00278                 'Theta'    => 920,
00279                 'theta'    => 952,
00280                 'thetasym' => 977,
00281                 'thinsp'   => 8201,
00282                 'THORN'    => 222,
00283                 'thorn'    => 254,
00284                 'tilde'    => 732,
00285                 'times'    => 215,
00286                 'trade'    => 8482,
00287                 'Uacute'   => 218,
00288                 'uacute'   => 250,
00289                 'uarr'     => 8593,
00290                 'uArr'     => 8657,
00291                 'Ucirc'    => 219,
00292                 'ucirc'    => 251,
00293                 'Ugrave'   => 217,
00294                 'ugrave'   => 249,
00295                 'uml'      => 168,
00296                 'upsih'    => 978,
00297                 'Upsilon'  => 933,
00298                 'upsilon'  => 965,
00299                 'Uuml'     => 220,
00300                 'uuml'     => 252,
00301                 'weierp'   => 8472,
00302                 'Xi'       => 926,
00303                 'xi'       => 958,
00304                 'Yacute'   => 221,
00305                 'yacute'   => 253,
00306                 'yen'      => 165,
00307                 'Yuml'     => 376,
00308                 'yuml'     => 255,
00309                 'Zeta'     => 918,
00310                 'zeta'     => 950,
00311                 'zwj'      => 8205,
00312                 'zwnj'     => 8204
00313         );
00314
00318         static $htmlEntityAliases = array(
                      // Alternative spellings of entity names, mapped to the canonical name used as a
                      // key in $htmlEntities. Both entries here (one in Hebrew letters, one in Arabic
                      // letters) are aliases for 'rlm'.
00319                 'רלמ' => 'rlm',
00320                 'رلم' => 'rlm',
00321         );
00322
00326         static $attribsRegex; // Cache for the pattern built by getAttribsRegex(); stays null until that method first runs
00327
00333         static function getAttribsRegex() {
                      // Returns the regular expression (string) that picks one attribute, with an
                      // optional value, out of the text of a tag. The pattern is assembled on the first
                      // call and stored in self::$attribsRegex; later calls return the stored string.
                      //
                      // Capture groups of the resulting pattern:
                      //   1 - attribute name
                      //   2 - the whole optional "= value" part
                      //   3 - value given in double quotes
                      //   4 - value given in single quotes
                      //   5 - unquoted value
                      //   6 - unquoted "#hex" value
                      // The trailing lookahead requires whitespace or end of input after the attribute.
00334                 if ( self::$attribsRegex === null ) {
                              // First character of a name, following name characters (also '-' and '.'),
                              // and the whitespace set: tab, LF, CR, space.
00335                         $attribFirst = '[:A-Z_a-z0-9]';
00336                         $attrib = '[:A-Z_a-z-.0-9]';
00337                         $space = '[\x09\x0a\x0d\x20]';
00338                         self::$attribsRegex =
00339                                 "/(?:^|$space)({$attribFirst}{$attrib}*)
00340                                   ($space*=$space*
00341                                         (?:
00342                                          # The attribute value: quoted or alone
00343                                           \"([^<\"]*)\"
00344                                          | '([^<']*)'
00345                                          |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00346                                          |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00347                                                                                  # colors are specified like this.
00348                                                                                  # We'll be normalizing it.
00349                                         )
00350                                 )?(?=$space|\$)/sx";
00351                 }
00352                 return self::$attribsRegex;
00353         }
00354
00366         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
                      // Cleans up HTML in $text. HTML comments are removed first; tags on the allowed
                      // list are kept with their attributes passed through fixTagAttributes(); every
                      // other '<...' sequence is emitted as escaped text ('&lt;', and '>' as '&gt;').
                      //
                      // $text            string   Input markup.
                      // $processCallback callable Optional. If callable it is invoked as
                      //                           callback( &$params, $args ) on the attribute text of an
                      //                           allowed tag before validation, so it may rewrite it
                      //                           (opening tags only when $wgUseTidy is false).
                      // $args            array    Passed through unchanged to $processCallback.
                      // $extratags       array    Extra tag names to accept; they are also added to the
                      //                           set of tags that must be closed.
                      // $removetags      array    Tag names to take out of the allowed set.
                      //
                      // Returns the sanitized string.
                      //
                      // When $wgUseTidy is false the method also keeps a stack of open tags so it can
                      // reject mismatched closing tags, refuse disallowed nesting and append closing
                      // tags for whatever is still open at the end. When $wgUseTidy is true none of that
                      // bookkeeping happens here (presumably tidy repairs the structure later -- confirm).
00367                 global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
00368
                      // Lookup tables live in function statics so they are built once per configuration.
00369                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00370                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
00371
00372                 wfProfileIn( __METHOD__ );
00373
00374                 // Base our staticInitialised variable off of the global config state so that if the globals
00375                 // are changed (like in the screwed up test system) we will re-initialise the settings.
00376                 $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
00377                 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
00378
00379                         $htmlpairsStatic = array( # Tags that must be closed
00380                                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00381                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00382                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
00383                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00384                                 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
00385                                 'kbd', 'samp'
00386                         );
00387                         if ( $wgHtml5 ) {
00388                                 $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
00389                         }
                              // Tags whose closing tag is optional
00390                         $htmlsingle = array(
00391                                 'br', 'hr', 'li', 'dt', 'dd'
00392                         );
00393                         $htmlsingleonly = array( # Elements that cannot have close tags
00394                                 'br', 'hr'
00395                         );
00396                         if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
00397                                 $htmlsingle[] = $htmlsingleonly[] = 'meta';
00398                                 $htmlsingle[] = $htmlsingleonly[] = 'link';
00399                         }
00400                         $htmlnest = array( # Tags that can be nested--??
00401                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00402                                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
00403                         );
00404                         $tabletags = array( # Can only appear inside table, we will close them
00405                                 'td', 'th', 'tr',
00406                         );
00407                         $htmllist = array( # Tags used by list
00408                                 'ul','ol',
00409                         );
00410                         $listtags = array( # Tags that can appear in a list
00411                                 'li',
00412                         );
00413
00414                         if ( $wgAllowImageTag ) {
00415                                 $htmlsingle[] = 'img';
00416                                 $htmlsingleonly[] = 'img';
00417                         }
00418
                              // Derived sets: tags that may be left unclosed, and every allowed element.
00419                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
00420                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
00421
00422                         # Convert them all to hashtables for faster lookup
00423                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
00424                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
00425                         foreach ( $vars as $var ) {
00426                                 $$var = array_flip( $$var );
00427                         }
                              // Remember which configuration these tables were built for.
00428                         $staticInitialised = $globalContext;
00429                 }
00430                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
00431                 $extratags = array_flip( $extratags );
00432                 $removetags = array_flip( $removetags );
00433                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
00434                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
00435
00436                 # Remove HTML comments
00437                 $text = Sanitizer::removeHTMLcomments( $text );
                      // Split on '<': the first piece is the text before any tag, each later piece
                      // starts right after a '<'.
00438                 $bits = explode( '<', $text );
00439                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
00440                 if ( !$wgUseTidy ) {
00441                         $tagstack = $tablestack = array();
00442                         foreach ( $bits as $x ) {
00443                                 $regs = array();
00444                                 # $slash: Does the current element start with a '/'?
00445                                 # $t: Current element name
00446                                 # $params: String between element name and >
00447                                 # $brace: Ending '>' or '/>'
00448                                 # $rest: Everything until the next element of $bits
00449                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00450                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00451                                 } else {
00452                                         $slash = $t = $params = $brace = $rest = null;
00453                                 }
00454
00455                                 $badtag = false;
00456                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00457                                         # Check our stack
00458                                         if ( $slash && isset( $htmlsingleonly[$t] ) ) {
                                                      // Closing tag for an element that cannot have one
00459                                                 $badtag = true;
00460                                         } elseif ( $slash ) {
00461                                                 # Closing a tag... is it the one we just opened?
00462                                                 $ot = @array_pop( $tagstack );
00463                                                 if ( $ot != $t ) {
00464                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
00465                                                                 # Pop all elements with an optional close tag
00466                                                                 # and see if we find a match below them
00467                                                                 $optstack = array();
00468                                                                 array_push( $optstack, $ot );
00469                                                                 wfSuppressWarnings();
00470                                                                 $ot = array_pop( $tagstack );
00471                                                                 wfRestoreWarnings();
00472                                                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
00473                                                                         array_push( $optstack, $ot );
00474                                                                         wfSuppressWarnings();
00475                                                                         $ot = array_pop( $tagstack );
00476                                                                         wfRestoreWarnings();
00477                                                                 }
00478                                                                 if ( $t != $ot ) {
00479                                                                         # No match. Push the optional elements back again
00480                                                                         $badtag = true;
00481                                                                         wfSuppressWarnings();
00482                                                                         $ot = array_pop( $optstack );
00483                                                                         wfRestoreWarnings();
00484                                                                         while ( $ot ) {
00485                                                                                 array_push( $tagstack, $ot );
00486                                                                                 wfSuppressWarnings();
00487                                                                                 $ot = array_pop( $optstack );
00488                                                                                 wfRestoreWarnings();
00489                                                                         }
00490                                                                 }
00491                                                         } else {
                                                                      // Put the popped tag back; the close tag did not match it
00492                                                                 @array_push( $tagstack, $ot );
00493                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
00494                                                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
00495                                                                         $badtag = true;
00496                                                                 }
00497                                                         }
00498                                                 } else {
00499                                                         if ( $t == 'table' ) {
                                                                      // Leaving a table: restore the stack saved when it was opened
00500                                                                 $tagstack = array_pop( $tablestack );
00501                                                         }
00502                                                 }
00503                                                 $newparams = '';
00504                                         } else {
00505                                                 # Keep track for later
00506                                                 if ( isset( $tabletags[$t] ) &&
00507                                                 !in_array( 'table', $tagstack ) ) {
00508                                                         $badtag = true;
00509                                                 } elseif ( in_array( $t, $tagstack ) &&
00510                                                 !isset( $htmlnest [$t ] ) ) {
00511                                                         $badtag = true;
00512                                                 # Is it a self closed htmlpair ? (bug 5487)
00513                                                 } elseif ( $brace == '/>' &&
00514                                                 isset( $htmlpairs[$t] ) ) {
00515                                                         $badtag = true;
00516                                                 } elseif ( isset( $htmlsingleonly[$t] ) ) {
00517                                                         # Hack to force empty tag for unclosable elements
00518                                                         $brace = '/>';
00519                                                 } elseif ( isset( $htmlsingle[$t] ) ) {
00520                                                         # Hack to not close $htmlsingle tags
00521                                                         $brace = null;
00522                                                         # Still need to push this optionally-closed tag to
00523                                                         # the tag stack so that we can match end tags
00524                                                         # instead of marking them as bad.
00525                                                         array_push( $tagstack, $t );
00526                                                 } elseif ( isset( $tabletags[$t] )
00527                                                 && in_array( $t, $tagstack ) ) {
00528                                                         // New table tag but forgot to close the previous one
00529                                                         $text .= "</$t>";
00530                                                 } else {
00531                                                         if ( $t == 'table' ) {
                                                                      // Entering a table: save the current stack and start a fresh one
00532                                                                 array_push( $tablestack, $tagstack );
00533                                                                 $tagstack = array();
00534                                                         }
00535                                                         array_push( $tagstack, $t );
00536                                                 }
00537
00538                                                 # Replace any variables or template parameters with
00539                                                 # plaintext results.
00540                                                 if( is_callable( $processCallback ) ) {
00541                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
00542                                                 }
00543
00544                                                 if ( !Sanitizer::validateTag( $params, $t ) ) {
00545                                                         $badtag = true;
00546                                                 }
00547
00548                                                 # Strip non-approved attributes from the tag
00549                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
00550                                         }
00551                                         if ( !$badtag ) {
                                                      // Accepted: emit the rebuilt tag followed by its trailing text
00552                                                 $rest = str_replace( '>', '&gt;', $rest );
00553                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00554                                                 $text .= "<$slash$t$newparams$close>$rest";
00555                                                 continue;
00556                                         }
00557                                 }
                                      // Not an allowed tag, or rejected above: output it as escaped literal text
00558                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
00559                         }
00560                         # Close off any remaining tags
00561                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
00562                                 $text .= "</$t>\n";
00563                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
00564                         }
00565                 } else {
00566                         # this might be possible using tidy itself
                              // No tag stack in this branch: only the element whitelist, validateTag()
                              // and attribute filtering are applied.
00567                         foreach ( $bits as $x ) {
00568                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00569                                 $x, $regs );
                                      // On a failed match $regs is empty; '@' hides the resulting notices
00570                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00571                                 $badtag = false;
00572                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00573                                         if( is_callable( $processCallback ) ) {
00574                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
00575                                         }
00576
00577                                         if ( !Sanitizer::validateTag( $params, $t ) ) {
00578                                                 $badtag = true;
00579                                         }
00580
00581                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
00582                                         if ( !$badtag ) {
00583                                                 $rest = str_replace( '>', '&gt;', $rest );
00584                                                 $text .= "<$slash$t$newparams$brace$rest";
00585                                                 continue;
00586                                         }
00587                                 }
00588                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
00589                         }
00590                 }
00591                 wfProfileOut( __METHOD__ );
00592                 return $text;
00593         }
00594
00605         static function removeHTMLcomments( $text ) {
                      // Strips every terminated <!-- ... --> comment from $text and returns the result.
                      // A comment that sits alone on its line (only spaces around it, newline on both
                      // sides) is removed together with those spaces and one of the two newlines.
                      // An unterminated comment, and anything after it, is left untouched.
00606                 wfProfileIn( __METHOD__ );
00607                 for ( ; ; ) {
00608                         $open = strpos( $text, '<!--' );
00609                         if ( $open === false ) {
00610                                 break;
00611                         }
00612                         $close = strpos( $text, '-->', $open + 4 );
00613                         if ( $close === false ) {
                                      # Unterminated comment; bail out
00614                                 break;
00615                         }
                              // Index of the first character after the '-->'
00616                         $close += 3;
00617
                              // Walk left over spaces, starting at the character just before the comment
00618                         $before = max( $open - 1, 0 );
00619                         while ( $before > 0 && substr( $text, $before, 1 ) === ' ' ) {
00620                                 $before--;
00621                         }
                              // Walk right over spaces that follow the comment
00622                         $after = $close;
00623                         while ( substr( $text, $after, 1 ) === ' ' ) {
00624                                 $after++;
00625                         }
00626
00627                         $ownLine = substr( $text, $before, 1 ) === "\n"
00628                                 && substr( $text, $after, 1 ) === "\n";
00629                         if ( $ownLine ) {
                                      // Drop the comment, the surrounding spaces and both newlines,
                                      // then put a single newline back.
00630                                 $text = substr_replace( $text, "\n", $before, $after - $before + 1 );
00631                         } else {
                                      # Remove just the comment.
00632                                 $text = substr_replace( $text, '', $open, $close - $open );
00633                         }
00634                 }
00635                 wfProfileOut( __METHOD__ );
00636                 return $text;
00637         }
00639
00652         static function validateTag( $params, $element ) {
                      // Checks element-specific requirements on a tag's attributes.
                      // $params is the raw attribute text of the tag, $element the tag name.
                      // Returns false only for a <meta> or <link> that lacks a required attribute;
                      // every other element passes.
00653                 $attribs = self::decodeTagAttributes( $params );
00654                 $isMeta = ( $element == 'meta' );
00655                 $isLink = ( $element == 'link' );
00656
00657                 if ( !$isMeta && !$isLink ) {
00658                         return true;
00659                 }
00660
                      // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
00661                 if ( !isset( $attribs['itemprop'] ) ) {
00662                         return false;
00663                 }
                      // <meta> must have a content="" for the itemprop
00664                 if ( $isMeta && !isset( $attribs['content'] ) ) {
00665                         return false;
00666                 }
                      // <link> must have an associated href=""
00667                 if ( $isLink && !isset( $attribs['href'] ) ) {
00668                         return false;
00669                 }
00670                 return true;
00671         }
00672
00680         /**
00681          * Filter an attribute list down to what the given element may carry.
00682          * Looks up the element's whitelist and hands over to validateAttributes().
00683          *
00684          * @param array $attribs Attribute name => value pairs
00685          * @param string $element Tag name used to pick the whitelist
00686          * @return array Whatever validateAttributes() returns for that whitelist
00687          */
00688         static function validateTagAttributes( $attribs, $element ) {
00689                 $allowed = Sanitizer::attributeWhitelist( $element );
00690                 return Sanitizer::validateAttributes( $attribs, $allowed );
00691         }
00692
00695         /**
00696          * Filter an attribute list against a whitelist and clean up the survivors.
00697          *
00698          * - xmlns:* declarations skip the whitelist when $wgAllowRdfaAttributes is set
00699          * - data-* attributes skip the whitelist when $wgHtml5 is set
00700          * - style is passed through checkCss(), id through escapeId()
00701          * - role is dropped unless it is exactly "presentation"
00702          * - href and src are dropped unless they start with a wfUrlProtocols() protocol
00703          *
00704          * @param array $attribs Attribute name => value pairs
00705          * @param array $whitelist List of allowed attribute names
00706          * @return array Sanitized attribute name => value pairs
00707          */
00708         static function validateAttributes( $attribs, $whitelist ) {
00709                 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
00710
00711                 // Flipped so that a whitelist lookup is a plain isset() on the name
00712                 $allowedNames = array_flip( $whitelist );
00713                 $urlPattern = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
00714
00715                 // RDFa and microdata properties allow URLs, URIs and/or CURIs, so the
00716                 // values of these attributes get checked for sanity in the loop.
00717                 $uriLikeAttribs = array(
00718                         'rel', 'rev',
00719                         'about', 'property', 'resource', 'datatype', 'typeof', # RDFa
00720                         'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype', # HTML5 microdata
00721                 );
00722
00723                 $out = array();
00724                 foreach ( $attribs as $attribute => $value ) {
00725                         # With RDFa enabled, XML namespace declarations are accepted
00726                         # regardless of the whitelist, as long as the value is not evil
00727                         if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
00728                                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00729                                         $out[$attribute] = $value;
00730                                 }
00731                                 continue;
00732                         }
00733
00734                         # In HTML5 mode any attribute beginning with "data-" is allowed;
00735                         # everything else has to be whitelisted
00736                         $isDataAttrib = $wgHtml5 && preg_match( '/^data-/i', $attribute );
00737                         if ( !$isDataAttrib && !isset( $allowedNames[$attribute] ) ) {
00738                                 continue;
00739                         }
00740
00741                         if ( $attribute == 'style' ) {
00742                                 # Strip javascript "expression" from stylesheets.
00743                                 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
00744                                 $value = Sanitizer::checkCss( $value );
00745                         } elseif ( $attribute === 'id' ) {
00746                                 $value = Sanitizer::escapeId( $value, 'noninitial' );
00747                         }
00748
00749                         # WAI-ARIA
00750                         # http://www.w3.org/TR/wai-aria/
00751                         # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
00752                         # Only role="presentation" is supported until it is worked out which
00753                         # roles content may use and the code explicitly rejects patterns
00754                         # that violate HTML5's ARIA restrictions.
00755                         if ( $attribute === 'role' && $value !== 'presentation' ) {
00756                                 continue;
00757                         }
00758
00759                         // Paranoia. Allow "simple" values but suppress javascript
00760                         if ( in_array( $attribute, $uriLikeAttribs, true )
00761                                 && preg_match( self::EVIL_URI_PATTERN, $value )
00762                         ) {
00763                                 continue;
00764                         }
00765
00766                         # NOTE: elements using href/src are not allowed directly, but the
00767                         #       validation is supplied for tag hook handlers and the like.
00768                         # Anything not using an allowed protocol is dropped, which
00769                         # also drops all relative URLs.
00770                         $isUrlAttrib = ( $attribute === 'href' || $attribute === 'src' );
00771                         if ( $isUrlAttrib && !preg_match( $urlPattern, $value ) ) {
00772                                 continue;
00773                         }
00774
00775                         // A repeated name overrides the earlier value, so the output
00776                         // has only one attribute of each name.
00777                         $out[$attribute] = $value;
00778                 }
00779
00780                 # itemtype, itemid, itemref don't make sense without itemscope
00781                 if ( $wgAllowMicrodataAttributes && !array_key_exists( 'itemscope', $out ) ) {
00782                         unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
00783                 }
00784                 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
00785                 return $out;
00786         }
00788
00790         /**
00791          * Merge two attribute arrays; for the same string key the value from $b
00792          * replaces the one from $a. The exception is 'class': two differing
00793          * string values are combined into one space-separated, duplicate-free list.
00794          *
00795          * @param array $a
00796          * @param array $b
00797          * @return array
00798          */
00799         static function mergeAttributes( $a, $b ) {
00800                 $merged = array_merge( $a, $b );
00801                 $bothSet = isset( $a['class'], $b['class'] );
00802                 if ( $bothSet && is_string( $a['class'] ) && is_string( $b['class'] )
00803                         && $a['class'] !== $b['class']
00804                 ) {
00805                         $names = preg_split( '/\s+/', $a['class'] . ' ' . $b['class'], -1, PREG_SPLIT_NO_EMPTY );
00806                         $merged['class'] = implode( ' ', array_unique( $names ) );
00807                 }
00808                 return $merged;
00809         }
00810
00817         /**
00818          * Sanitize the value of a style attribute.
00819          *
00820          * The value is decoded (character references, then CSS escapes), look-alike
00821          * characters are folded to ASCII and comments are removed. If the result
00822          * contains a control character or one of the rejected keywords, a fixed
00823          * placeholder comment string is returned in place of the value.
00824          *
00825          * @param string $value Raw style attribute value
00826          * @return string Cleaned value, or a placeholder comment for rejected input
00827          */
00828         static function checkCss( $value ) {
00829                 // Step 1: resolve character references like &#123; to literal characters
00830                 $value = Sanitizer::decodeCharReferences( $value );
00831
00832                 // Step 2: decode escape sequences and line continuations, following
00833                 // the grammar in appendix D of the CSS 2 spec.
00834                 // Doing this AFTER step 1 guarantees that no unsanitized escape
00835                 // sequence survives into the return value. Input can be crafted
00836                 // whose character references decode to escape sequences that in
00837                 // turn decode to character references, but character references
00838                 // in the result are fine: the caller is supposed to escape those.
00839                 static $decodeRegex;
00840                 if ( !$decodeRegex ) {
00841                         // Assembled on first use, then reused through the static
00842                         $backslash = '\\\\';
00843                         $nl = '(?:\\n|\\r\\n|\\r|\\f)';
00844                         $space = '[\\x20\\t\\r\\n\\f]';
00846                         $decodeRegex = "/ $backslash
00847                                 (?:
00848                                         ($nl) |  # 1. Line continuation
00849                                         ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
00850                                         (.) | # 3. backslash cancelling special meaning
00851                                         () | # 4. backslash at end of string
00852                                 )/xu";
00853                 }
00854                 $value = preg_replace_callback(
00855                         $decodeRegex, array( __CLASS__, 'cssDecodeCallback' ), $value );
00856
00857                 // Step 3: fold the Halfwidth and Fullwidth Unicode block, which IE6
00858                 // might treat as ascii, onto the matching ASCII characters
00859                 $value = preg_replace_callback(
00860                         '/[！-［］-ｚ]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
00861                         function ( $matches ) {
00862                                 $cp = utf8ToCodepoint( $matches[0] );
00863                                 // 65248 = 0xFEE0, the offset down to ASCII \x21-\x7A
00864                                 return $cp === false ? '' : chr( $cp - 65248 );
00865                         },
00866                         $value
00867                 );
00868
00869                 // Step 4: same for further characters IE6 might treat as ascii
00870                 // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
00871                 $lookalikes = array(
00872                         'ʀ' => 'r', 'ɴ' => 'n', 'ⁿ' => 'n', 'ʟ' => 'l',
00873                         'ɪ' => 'i', '⁽' => '(', '₍' => '(',
00874                 );
00875                 $value = str_replace( array_keys( $lookalikes ), array_values( $lookalikes ), $value );
00876
00877                 // Step 5: remove comments; IE gets token splitting wrong.
00878                 // Must come AFTER the decoding steps because they can introduce
00879                 // comments. Comments are replaced by a space rather than deleted,
00880                 // so this step cannot itself create character references or
00881                 // escape sequences.
00882                 $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
00883
00884                 // Step 6: cut everything from a leftover comment-start token onwards,
00885                 // to guard against incorrect client implementations.
00886                 $openComment = strpos( $value, '/*' );
00887                 if ( $openComment !== false ) {
00888                         $value = substr( $value, 0, $openComment );
00889                 }
00890
00891                 // Step 7: S followed by repeat, iteration, or prolonged sound marks,
00892                 // which IE will treat as "ss"
00893                 $value = preg_replace(
00896                         '/s(?:
00897                                 \xE3\x80\xB1 | # U+3031
00898                                 \xE3\x82\x9D | # U+309D
00899                                 \xE3\x83\xBC | # U+30FC
00900                                 \xE3\x83\xBD | # U+30FD
00901                                 \xEF\xB9\xBC | # U+FE7C
00902                                 \xEF\xB9\xBD | # U+FE7D
00903                                 \xEF\xBD\xB0   # U+FF70
00904                         )/ix',
00905                         'ss', $value );
00906
00907                 // Step 8: reject control characters and problematic keywords outright
00908                 if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
00909                         return '/* invalid control char */';
00910                 }
00911
00912                 if ( preg_match(
00913                         '! expression
00914                                 | filter\s*:
00915                                 | accelerator\s*:
00916                                 | -o-link\s*:
00917                                 | -o-link-source\s*:
00918                                 | -o-replace\s*:
00919                                 | url\s*\(
00920                                 | image\s*\(
00921                                 | image-set\s*\(
00922                         !ix', $value ) ) {
00923                         return '/* insecure input */';
00924                 }
00925                 return $value;
00926         }
00927
00928         /**
00929          * preg_replace_callback() handler used by checkCss() to decode one
00930          * CSS escape sequence; $matches comes from the decode regex there.
00931          */
00932         static function cssDecodeCallback( $matches ) {
00933                 if ( $matches[1] !== '' ) {
00934                         // Line continuation: backslash and newline both vanish
00935                         return '';
00936                 }
00937
00938                 if ( $matches[2] !== '' ) {
00939                         $char = codepointToUtf8( hexdec( $matches[2] ) );
00940                 } else {
00941                         // Either an escaped literal character, or (empty group 3)
00942                         // a lone backslash at the end of the string
00943                         $char = ( $matches[3] !== '' ) ? $matches[3] : '\\';
00944                 }
00945
00946                 // Newline, quotes and backslash need to stay escaped inside strings;
00947                 // they are re-emitted as a clean hex escape to avoid client parsing
00948                 // errors. Any other escape was unnecessary and is simply decoded.
00949                 $keepEscaped = in_array( $char, array( "\n", '"', "'", '\\' ) );
00950                 return $keepEscaped ? '\\' . dechex( ord( $char ) ) . ' ' : $char;
00951         }
00952
00962         /**
00963          * Turn the raw attribute text of a tag into safe, re-encoded attribute HTML.
00964          * The text is decoded, filtered through validateTagAttributes() for the
00965          * element, and every remaining pair is written back as name="value".
00966          *
00967          * @param string $text Raw attribute text
00968          * @param string $element Tag name
00969          * @return string Attribute HTML with a leading space, or '' if nothing remains
00970          */
00972         static function fixTagAttributes( $text, $element ) {
00973                 if ( trim( $text ) == '' ) {
00974                         return '';
00975                 }
00976
00977                 $safe = Sanitizer::validateTagAttributes(
00978                         Sanitizer::decodeTagAttributes( $text ), $element );
00979
00980                 // Each surviving pair is appended as ` name="value"`, so the result
00981                 // already carries its leading space and stays '' when nothing is left.
00982                 $html = '';
00983                 foreach ( $safe as $name => $rawValue ) {
00984                         $html .= ' ' . htmlspecialchars( $name )
00985                                 . '="' . Sanitizer::safeEncodeAttribute( $rawValue ) . '"';
00986                 }
00987                 return $html;
00988         }
00989
00990         /**
00991          * HTML-encode text for use inside an attribute value.
00992          * @param string $text
00993          * @return string Encoded text, with \n, \r and \t as character references
00994          */
00995         static function encodeAttribute( $text ) {
00996                 // Attribute decoding normalizes whitespace, so newlines, carriage
00997                 // returns and tabs only survive a round trip when they are
00998                 // written out as character references up front.
00999                 $whitespaceRefs = array(
01000                         "\n" => '&#10;',
01001                         "\r" => '&#13;',
01002                         "\t" => '&#9;',
01003                 );
01004
01005                 return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
01006         }
01009
01010         /**
01011          * Like encodeAttribute(), and additionally armors the result against
01012          * expansion by later parsing stages.
01013          * @param string $text
01014          * @return string
01015          */
01016         static function safeEncodeAttribute( $text ) {
01017                 # Templates and links may be expanded in later parsing, creating
01018                 # invalid or dangerous output. Each sequence below is defused by
01019                 # swapping in a character reference.
01020                 $wikitextArmor = array(
01021                         '<'    => '&lt;',   // This should never happen,
01022                         '>'    => '&gt;',   // we've received invalid input
01023                         '"'    => '&quot;', // which should have been escaped.
01024                         '{'    => '&#123;',
01025                         '['    => '&#91;',
01026                         "''"   => '&#39;&#39;',
01027                         'ISBN' => '&#73;SBN',
01028                         'RFC'  => '&#82;FC',
01029                         'PMID' => '&#80;MID',
01030                         '|'    => '&#124;',
01031                         '__'   => '&#95;_',
01032                 );
01033                 $armored = strtr( Sanitizer::encodeAttribute( $text ), $wikitextArmor );
01034
01035                 # Stupid hack: anything matching a URL protocol gets its colon
01036                 # encoded as well (see armorLinksCallback)
01037                 $protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
01038                 return preg_replace_callback(
01039                         $protocolPattern, array( 'Sanitizer', 'armorLinksCallback' ), $armored );
01040         }
01042
01061         /**
01062          * Turn arbitrary text into a value usable as an HTML id.
01063          *
01064          * With $wgHtml5 and $wgExperimentalHtmlIds both on, and unless 'legacy' is
01065          * requested, only whitespace and the characters _ ' " & # % are collapsed
01066          * into underscores. Otherwise the id is URL-encoded with "." standing in
01067          * for "%" (and ":" kept literal), and gets an "x" prefix when it does not
01068          * start with an ASCII letter, unless 'noninitial' is requested.
01069          *
01070          * @param string $id Text to escape
01071          * @param string|array $options 'legacy' and/or 'noninitial'
01072          * @return string
01073          */
01074         static function escapeId( $id, $options = array() ) {
01075                 global $wgHtml5, $wgExperimentalHtmlIds;
01076                 $options = (array)$options;
01077
01078                 $useLegacy = !$wgHtml5 || !$wgExperimentalHtmlIds || in_array( 'legacy', $options );
01079                 if ( !$useLegacy ) {
01080                         # HTML5-style: decode, then collapse runs of whitespace and
01081                         # a few special characters into a single underscore
01082                         $id = Sanitizer::decodeCharReferences( $id );
01083                         $id = trim( preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ), '_' );
01084                         # Nothing left, e.g. the input was all whitespace
01085                         return ( $id === '' ) ? '_' : $id;
01086                 }
01087
01088                 # HTML4-style escaping: URL-encode, then turn %3A back into ":" and
01089                 # every remaining "%" into "."
01090                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
01091                 $id = str_replace( array( '%3A', '%' ), array( ':', '.' ), $id );
01092
01093                 // Initial character must be a letter, unless the 'noninitial'
01094                 // option is given
01095                 $startsWithLetter = preg_match( '/^[a-zA-Z]/', $id );
01096                 if ( !$startsWithLetter && !in_array( 'noninitial', $options ) ) {
01097                         $id = "x$id";
01098                 }
01099                 return $id;
01100         }
01106
01112         /**
01113          * Escape a string so that it is usable as a CSS class name.
01114          * @param string $class
01115          * @return string
01116          */
01118         static function escapeClass( $class ) {
01119                 // Pass 1 turns ugly stuff into underscores, pass 2 squeezes runs of
01120                 // underscores; trailing underscores are dropped at the end.
01121                 $patterns = array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' );
01122                 $escaped = preg_replace( $patterns, '_', $class );
01123                 return rtrim( $escaped, '_' );
01124         }
01125
01127         /**
01128          * HTML-escape text whose character references should keep their meaning.
01129          * @param string $html
01130          * @return string
01131          */
01133         static function escapeHtmlAllowEntities( $html ) {
01134                 # References are decoded first and escaping is applied to the result;
01135                 # ' is escaped along with " (ENT_QUOTES) as a matter of course.
01136                 $decoded = Sanitizer::decodeCharReferences( $html );
01137                 return htmlspecialchars( $decoded, ENT_QUOTES );
01138         }
01140
01141         /**
01142          * preg_replace_callback() handler for safeEncodeAttribute(): encodes
01143          * every colon in the captured text as &#58;.
01144          */
01146         private static function armorLinksCallback( $matches ) {
01147                 return strtr( $matches[1], array( ':' => '&#58;' ) );
01148         }
01149
01150         /**
01151          * Parse the attribute text of a tag into a name => value array.
01152          * Names are lowercased; values have whitespace normalized and character
01153          * references decoded. A repeated name keeps its last value.
01154          *
01155          * @param string $text
01156          * @return array
01157          */
01158         public static function decodeTagAttributes( $text ) {
01159                 $decoded = array();
01160                 if ( trim( $text ) == '' ) {
01161                         return $decoded;
01162                 }
01163
01164                 $sets = array();
01165                 $found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
01166                 if ( !$found ) {
01167                         // No attribute matched (or the match failed)
01168                         return $decoded;
01169                 }
01170
01171                 foreach ( $sets as $set ) {
01172                         $name = strtolower( $set[1] );
01173                         $raw = Sanitizer::getTagAttributeCallback( $set );
01174
01175                         // Collapse whitespace runs to one space and trim, then
01176                         // decode character references in what is left
01177                         $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );
01178                         $decoded[$name] = Sanitizer::decodeCharReferences( $collapsed );
01179                 }
01180                 return $decoded;
01181         }
01186
01187         /**
01188          * Pick the attribute value out of one match set produced in
01189          * decodeTagAttributes().
01190          *
01191          * @param array $set One PREG_SET_ORDER match
01192          * @return string
01193          * @throws MWException When group 2 is set but no value group is
01194          */
01195         private static function getTagAttributeCallback( $set ) {
01196                 # Value groups, checked from the highest index down:
01197                 # 6 = illegal #XXXXXX color with no quotes, 5 = no quotes,
01198                 # 4 = single-quoted, 3 = double-quoted
01199                 foreach ( array( 6, 5, 4, 3 ) as $group ) {
01200                         if ( isset( $set[$group] ) ) {
01201                                 return $set[$group];
01202                         }
01203                 }
01204
01205                 if ( isset( $set[2] ) ) {
01206                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
01207                 }
01208
01209                 # No value at all. In XHTML, attributes must have a value, so the
01210                 # 'reduced' form explicitly gets the attribute name as its value.
01211                 return $set[1];
01212         }
01216
01221         /**
01222          * Normalize an attribute value: character references first, then
01223          * whitespace, and finally double quotes become &quot;.
01224          * @param string $text
01225          * @return string
01226          */
01228         private static function normalizeAttributeValue( $text ) {
01229                 $normalized = Sanitizer::normalizeCharReferences( $text );
01230                 $normalized = self::normalizeWhitespace( $normalized );
01231                 return str_replace( '"', '&quot;', $normalized );
01232         }
01233
01234         /**
01235          * Replace each CRLF pair, space, CR, LF or tab with one plain space.
01236          * Runs are not collapsed.
01237          */
01238         private static function normalizeWhitespace( $text ) {
01239                 // CRLF is listed first so the pair becomes a single space, not two
01240                 $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
01241                 return preg_replace( $whitespace, ' ', $text );
01242         }
01244
01246         /**
01247          * Collapse every run of spaces and underscores in a section name into
01248          * one space and trim the ends.
01249          * @param string $section
01250          * @return string
01251          */
01252         static function normalizeSectionNameWhitespace( $section ) {
01253                 $collapsed = preg_replace( '/[ _]+/', ' ', $section );
01254                 return trim( $collapsed );
01255         }
01256
01265         /**
01266          * Run every match of CHAR_REFS_REGEX in the text through
01267          * normalizeCharReferencesCallback().
01268          * @param string $text
01269          * @return string
01270          */
01272         static function normalizeCharReferences( $text ) {
01273                 $callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
01274                 return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
01275         }
01278         /**
01279          * preg_replace_callback() handler for normalizeCharReferences().
01280          * $matches holds the CHAR_REFS_REGEX groups: name, decimal, hex, bare &.
01281          */
01282         static function normalizeCharReferencesCallback( $matches ) {
01283                 if ( $matches[1] != '' ) {
01284                         $normalized = Sanitizer::normalizeEntity( $matches[1] );
01285                 } elseif ( $matches[2] != '' ) {
01286                         $normalized = Sanitizer::decCharReference( $matches[2] );
01287                 } elseif ( $matches[3] != '' ) {
01288                         $normalized = Sanitizer::hexCharReference( $matches[3] );
01289                 } else {
01290                         // Bare ampersand (group 4)
01291                         $normalized = null;
01292                 }
01293                 // null also comes back for numeric references to invalid code points;
01294                 // in both cases the matched text is escaped instead of passed through
01295                 return ( $normalized === null ) ? htmlspecialchars( $matches[0] ) : $normalized;
01296         }
01297
01298         /**
01299          * Normalize one named entity, given without the & and ;.
01300          * A name listed in $htmlEntityAliases is replaced by the name it maps to;
01301          * lt, gt, amp and quot stay named; other names from $htmlEntities become
01302          * decimal references; anything else gets its ampersand escaped.
01303          *
01304          * @param string $name
01305          * @return string
01306          */
01308         static function normalizeEntity( $name ) {
01309                 if ( isset( self::$htmlEntityAliases[$name] ) ) {
01310                         return '&' . self::$htmlEntityAliases[$name] . ';';
01311                 }
01312                 if ( in_array( $name, array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
01313                         return "&$name;";
01314                 }
01315                 if ( isset( self::$htmlEntities[$name] ) ) {
01316                         return '&#' . self::$htmlEntities[$name] . ';';
01317                 }
01318                 return "&amp;$name;";
01319         }
01320
01321         /**
01322          * Normalize a decimal character reference given by its digits.
01323          * @return string|null Null when the code point fails validateCodepoint()
01324          */
01325         static function decCharReference( $codepoint ) {
01326                 $point = intval( $codepoint );
01327                 if ( !Sanitizer::validateCodepoint( $point ) ) {
01328                         return null;
01329                 }
01330                 return sprintf( '&#%d;', $point );
01331         }
01333
01334         /**
01335          * Normalize a hexadecimal character reference given by its hex digits.
01336          * @return string|null Null when the code point fails validateCodepoint()
01337          */
01338         static function hexCharReference( $codepoint ) {
01339                 $point = hexdec( $codepoint );
01340                 if ( !Sanitizer::validateCodepoint( $point ) ) {
01341                         return null;
01342                 }
01343                 return sprintf( '&#x%x;', $point );
01344         }
01346
01347         /**
01348          * Tell whether a code point number is in one of the accepted ranges.
01349          */
01350         private static function validateCodepoint( $codepoint ) {
01351                 // Tab, line feed and carriage return are the only values
01352                 // accepted below 0x20
01353                 if ( in_array( $codepoint, array( 0x09, 0x0a, 0x0d ) ) ) {
01354                         return true;
01355                 }
01356                 // Otherwise: 0x20..0xD7FF, 0xE000..0xFFFD or 0x10000..0x10FFFF
01357                 return ( 0x20 <= $codepoint && $codepoint <= 0xd7ff )
01358                         || ( 0xe000 <= $codepoint && $codepoint <= 0xfffd )
01359                         || ( 0x10000 <= $codepoint && $codepoint <= 0x10ffff );
01360         }
01360
01362         /**
01363          * Run every match of CHAR_REFS_REGEX in the text through
01364          * decodeCharReferencesCallback().
01365          * @param string $text
01366          * @return string
01367          */
01368         public static function decodeCharReferences( $text ) {
01369                 $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
01370                 return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
01371         }
01374
01377         /**
01378          * Same replacement as decodeCharReferences(), followed by
01379          * $wgContLang->normalize() whenever at least one replacement was made.
01380          *
01381          * @param string $text
01382          * @return string
01383          */
01385         public static function decodeCharReferencesAndNormalize( $text ) {
01386                 global $wgContLang;
01387
01388                 $replaced = 0;
01389                 $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
01390                 $text = preg_replace_callback(
01391                         self::CHAR_REFS_REGEX, $callback, $text, /* limit */ -1, $replaced );
01392
01393                 // When nothing matched the text is handed back without normalization
01394                 return $replaced ? $wgContLang->normalize( $text ) : $text;
01395         }
01398
01399         /**
01400          * preg_replace_callback() handler for decodeCharReferences().
01401          * $matches holds the CHAR_REFS_REGEX groups: name, decimal, hex, bare &.
01402          */
01403         static function decodeCharReferencesCallback( $matches ) {
01404                 if ( $matches[1] != '' ) {
01405                         return Sanitizer::decodeEntity( $matches[1] );
01406                 }
01407                 if ( $matches[2] != '' ) {
01408                         return Sanitizer::decodeChar( intval( $matches[2] ) );
01409                 }
01410                 # Hexadecimal reference, or else a lone ampersand that stays as it is
01411                 return ( $matches[3] != '' )
01412                         ? Sanitizer::decodeChar( hexdec( $matches[3] ) ) : $matches[0];
01413         }
01414
01416         /**
01417          * Convert a code point number to UTF-8 text.
01418          * @param int $codepoint
01419          * @return string UTF8_REPLACEMENT when validateCodepoint() rejects the value
01420          */
01422         static function decodeChar( $codepoint ) {
01423                 return Sanitizer::validateCodepoint( $codepoint )
01424                         ? codepointToUtf8( $codepoint )
01425                         : UTF8_REPLACEMENT;
01426         }
01429
01430         /**
01431          * Decode one named entity, given without the & and ;.
01432          * Names listed in $htmlEntityAliases are resolved first.
01433          *
01434          * @param string $name
01435          * @return string UTF-8 text, or "&name;" if the name is not in $htmlEntities
01436          */
01438         static function decodeEntity( $name ) {
01439                 $canonical = isset( self::$htmlEntityAliases[$name] )
01440                         ? self::$htmlEntityAliases[$name]
01441                         : $name;
01442                 if ( !isset( self::$htmlEntities[$canonical] ) ) {
01443                         // Unknown entity: hand it back in its (alias-resolved) source form
01444                         return "&$canonical;";
01445                 }
01446                 return codepointToUtf8( self::$htmlEntities[$canonical] );
01447         }
01448
01449         /**
01450          * Fetch the attribute whitelist of one element.
01451          * @param string $element
01452          * @return array Empty when setupAttributeWhitelist() has no entry for it
01453          */
01454         static function attributeWhitelist( $element ) {
01455                 $whitelists = Sanitizer::setupAttributeWhitelist();
01456                 if ( !isset( $whitelists[$element] ) ) {
01457                         return array();
01458                 }
01459                 return $whitelists[$element];
01460         }
01461
01467         static function setupAttributeWhitelist() {
01468                 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
01469
01470                 static $whitelist, $staticInitialised;
01471                 $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
01472
01473                 if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
01474                         return $whitelist;
01475                 }
01476
01477                 $common = array(
01478                         # HTML
01479                         'id',
01480                         'class',
01481                         'style',
01482                         'lang',
01483                         'dir',
01484                         'title',
01485
01486                         # WAI-ARIA
01487                         'role',
01488                 );
01489
01490                 if ( $wgAllowRdfaAttributes ) {
01491                         #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
01492                         $common = array_merge( $common, array(
01493                             'about', 'property', 'resource', 'datatype', 'typeof',
01494                         ) );
01495                 }
01496
01497                 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
01498                         # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
01499                         $common = array_merge( $common, array(
01500                             'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
01501                         ) );
01502                 }
01503
01504                 $block = array_merge( $common, array( 'align' ) );
01505                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
01506                 $tablecell = array( 'abbr',
01507                                     'axis',
01508                                     'headers',
01509                                     'scope',
01510                                     'rowspan',
01511                                     'colspan',
01512                                     'nowrap', # deprecated
01513                                     'width',  # deprecated
01514                                     'height', # deprecated
01515                                     'bgcolor' # deprecated
01516                                     );
01517
01518                 # Numbers refer to sections in HTML 4.01 standard describing the element.
01519                 # See: http://www.w3.org/TR/html4/
01520                 $whitelist = array(
01521                         # 7.5.4
01522                         'div'        => $block,
01523                         'center'     => $common, # deprecated
01524                         'span'       => $block, # ??
01525
01526                         # 7.5.5
01527                         'h1'         => $block,
01528                         'h2'         => $block,
01529                         'h3'         => $block,
01530                         'h4'         => $block,
01531                         'h5'         => $block,
01532                         'h6'         => $block,
01533
01534                         # 7.5.6
01535                         # address
01536
01537                         # 8.2.4
01538                         # bdo
01539
01540                         # 9.2.1
01541                         'em'         => $common,
01542                         'strong'     => $common,
01543                         'cite'       => $common,
01544                         'dfn'        => $common,
01545                         'code'       => $common,
01546                         'samp'       => $common,
01547                         'kbd'        => $common,
01548                         'var'        => $common,
01549                         'abbr'       => $common,
01550                         # acronym
01551
01552                         # 9.2.2
01553                         'blockquote' => array_merge( $common, array( 'cite' ) ),
01554                         # q
01555
01556                         # 9.2.3
01557                         'sub'        => $common,
01558                         'sup'        => $common,
01559
01560                         # 9.3.1
01561                         'p'          => $block,
01562
01563                         # 9.3.2
01564                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
01565
01566                         # 9.3.4
01567                         'pre'        => array_merge( $common, array( 'width' ) ),
01568
01569                         # 9.4
01570                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01571                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01572
01573                         # 10.2
01574                         'ul'         => array_merge( $common, array( 'type' ) ),
01575                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
01576                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
01577
01578                         # 10.3
01579                         'dl'         => $common,
01580                         'dd'         => $common,
01581                         'dt'         => $common,
01582
01583                         # 11.2.1
01584                         'table'      => array_merge( $common,
01585                                                                 array( 'summary', 'width', 'border', 'frame',
01586                                                                                 'rules', 'cellspacing', 'cellpadding',
01587                                                                                 'align', 'bgcolor',
01588                                                                 ) ),
01589
01590                         # 11.2.2
01591                         'caption'    => array_merge( $common, array( 'align' ) ),
01592
01593                         # 11.2.3
01594                         'thead'      => array_merge( $common, $tablealign ),
01595                         'tfoot'      => array_merge( $common, $tablealign ),
01596                         'tbody'      => array_merge( $common, $tablealign ),
01597
01598                         # 11.2.4
01599                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01600                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01601
01602                         # 11.2.5
01603                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
01604
01605                         # 11.2.6
01606                         'td'         => array_merge( $common, $tablecell, $tablealign ),
01607                         'th'         => array_merge( $common, $tablecell, $tablealign ),
01608
01609                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
01610                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
01611
01612                         # 13.2
01613                         # Not usually allowed, but may be used for extension-style hooks
01614                         # such as <math> when it is rasterized, or if $wgAllowImageTag is
01615                         # true
01616                         'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
01617
01618                         # 15.2.1
01619                         'tt'         => $common,
01620                         'b'          => $common,
01621                         'i'          => $common,
01622                         'big'        => $common,
01623                         'small'      => $common,
01624                         'strike'     => $common,
01625                         's'          => $common,
01626                         'u'          => $common,
01627
01628                         # 15.2.2
01629                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
01630                         # basefont
01631
01632                         # 15.3
01633                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
01634
01635                         # XHTML Ruby annotation text module, simple ruby only.
01636                         # http://www.w3c.org/TR/ruby/
01637                         'ruby'       => $common,
01638                         # rbc
01639                         # rtc
01640                         'rb'         => $common,
01641                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
01642                         'rp'         => $common,
01643
01644                         # MathML root element, where used for extensions
01645                         # 'title' may not be 100% valid here; it's XHTML
01646                         # http://www.w3.org/TR/REC-MathML/
01647                         'math'       => array( 'class', 'style', 'id', 'title' ),
01648
01649                         # HTML 5 section 4.6
01650                         'bdi' => $common,
01651
01652                 );
01653
01654                 if ( $wgHtml5 ) {
01655                         # HTML5 elements, defined by:
01656                         # http://www.whatwg.org/specs/web-apps/current-work/multipage/
01657                         $whitelist += array(
01658                                 'data' => array_merge( $common, array( 'value' ) ),
01659                                 'time' => array_merge( $common, array( 'datetime' ) ),
01660                                 'mark' => $common,
01661
01662                                 // meta and link are only permitted by removeHTMLtags when Microdata
01663                                 // is enabled so we don't bother adding a conditional to hide these
01664                                 // Also meta and link are only valid in WikiText as Microdata elements
01665                                 // (ie: validateTag rejects tags missing the attributes needed for Microdata)
01666                                 // So we don't bother including $common attributes that have no purpose.
01667                                 'meta' => array( 'itemprop', 'content' ),
01668                                 'link' => array( 'itemprop', 'href' ),
01669                         );
01670                 }
01671
01672                 $staticInitialised = $globalContext;
01673
01674                 return $whitelist;
01675         }
01676
01687         static function stripAllTags( $text ) {
01688                 # Actual <tags>
01689                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
01690
01691                 # Normalize &entities and whitespace
01692                 $text = self::decodeCharReferences( $text );
01693                 $text = self::normalizeWhitespace( $text );
01694
01695                 return $text;
01696         }
01697
01707         static function hackDocType() {
01708                 $out = "<!DOCTYPE html [\n";
01709                 foreach( self::$htmlEntities as $entity => $codepoint ) {
01710                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
01711                 }
01712                 $out .= "]>\n";
01713                 return $out;
01714         }
01715
01720         static function cleanUrl( $url ) {
01721                 # Normalize any HTML entities in input. They will be
01722                 # re-escaped by makeExternalLink().
01723                 $url = Sanitizer::decodeCharReferences( $url );
01724
01725                 # Escape any control characters introduced by the above step
01726                 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
01727                         array( __CLASS__, 'cleanUrlCallback' ), $url );
01728
01729                 # Validate hostname portion
01730                 $matches = array();
01731                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
01732                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
01733
01734                         // Characters that will be ignored in IDNs.
01735                         // http://tools.ietf.org/html/3454#section-3.1
01736                         // Strip them before further processing so blacklists and such work.
01737                         $strip = "/
01738                                 \\s|          # general whitespace
01739                                 \xc2\xad|     # 00ad SOFT HYPHEN
01740                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
01741                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
01742                                 \xe2\x81\xa0| # 2060 WORD JOINER
01743                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
01744                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
01745                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
01746                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
01747                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
01748                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
01749                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
01750                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
01751                                 /xuD";
01752
01753                         $host = preg_replace( $strip, '', $host );
01754
01755                         // @todo FIXME: Validate hostnames here
01756
01757                         return $protocol . $host . $rest;
01758                 } else {
01759                         return $url;
01760                 }
01761         }
01762
01767         static function cleanUrlCallback( $matches ) {
01768                 return urlencode( $matches[0] );
01769         }
01770
01799         public static function validateEmail( $addr ) {
01800                 $result = null;
01801                 if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
01802                         return $result;
01803                 }
01804
01805                 // Please note strings below are enclosed in brackets [], this make the
01806                 // hyphen "-" a range indicator. Hence it is double backslashed below.
01807                 // See bug 26948
01808                 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
01809                 $rfc1034_ldh_str = "a-z0-9\\-";
01810
01811                 $HTML5_email_regexp = "/
01812                 ^                      # start of string
01813                 [$rfc5322_atext\\.]+    # user part which is liberal :p
01814                 @                      # 'apostrophe'
01815                 [$rfc1034_ldh_str]+       # First domain part
01816                 (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
01817                 $                      # End of string
01818                 /ix"; // case Insensitive, eXtended
01819
01820                 return (bool) preg_match( $HTML5_email_regexp, $addr );
01821         }
01822 }