MediaWiki  REL1_21
Sanitizer.php
Go to the documentation of this file.
00001 <?php
00031 class Sanitizer {
/**
 * Pattern matching one character reference or a stray ampersand. The four
 * alternatives each own one capture group, so only one group is filled per
 * match:
 *   1: name of a named reference (ASCII letters and digits, or bytes 0x80-0xff)
 *   2: digits of a decimal numeric reference
 *   3: digits of a hexadecimal numeric reference (after the x or X)
 *   4: an ampersand that does not begin any of the forms above
 * The x flag makes the line breaks and indentation inside the pattern
 * insignificant.
 */
00036         const CHAR_REFS_REGEX =
00037                 '/&([A-Za-z0-9\x80-\xff]+);
00038                  |&\#([0-9]+);
00039                  |&\#[xX]([0-9A-Fa-f]+);
00040                  |(&)/x';
00041 
// Case-insensitive pattern for a "javascript" or "vbscript" keyword that sits
// at the start of the value, after whitespace, or after a star-slash sequence
// (optionally followed by whitespace), and that is followed by a non-word
// character or the end of the value. validateAttributes() drops xmlns
// declarations whose value matches it.
00050         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
// Pattern for attribute names of the form "xmlns:" followed by one or more
// name characters. validateAttributes() uses it to recognise XML namespace
// declarations when RDFa attributes are enabled.
00051         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052 
/**
 * Table of named character entities: entity name (case-sensitive, without
 * the leading ampersand and trailing semicolon) => decimal code point of the
 * character it stands for, e.g. "amp" => 38 and "lt" => 60.
 * Entries are kept in case-insensitive alphabetical order.
 */
00059         static $htmlEntities = array(
00060                 'Aacute'   => 193,
00061                 'aacute'   => 225,
00062                 'Acirc'    => 194,
00063                 'acirc'    => 226,
00064                 'acute'    => 180,
00065                 'AElig'    => 198,
00066                 'aelig'    => 230,
00067                 'Agrave'   => 192,
00068                 'agrave'   => 224,
00069                 'alefsym'  => 8501,
00070                 'Alpha'    => 913,
00071                 'alpha'    => 945,
00072                 'amp'      => 38,
00073                 'and'      => 8743,
00074                 'ang'      => 8736,
00075                 'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00076                 'Aring'    => 197,
00077                 'aring'    => 229,
00078                 'asymp'    => 8776,
00079                 'Atilde'   => 195,
00080                 'atilde'   => 227,
00081                 'Auml'     => 196,
00082                 'auml'     => 228,
00083                 'bdquo'    => 8222,
00084                 'Beta'     => 914,
00085                 'beta'     => 946,
00086                 'brvbar'   => 166,
00087                 'bull'     => 8226,
00088                 'cap'      => 8745,
00089                 'Ccedil'   => 199,
00090                 'ccedil'   => 231,
00091                 'cedil'    => 184,
00092                 'cent'     => 162,
00093                 'Chi'      => 935,
00094                 'chi'      => 967,
00095                 'circ'     => 710,
00096                 'clubs'    => 9827,
00097                 'cong'     => 8773,
00098                 'copy'     => 169,
00099                 'crarr'    => 8629,
00100                 'cup'      => 8746,
00101                 'curren'   => 164,
00102                 'dagger'   => 8224,
00103                 'Dagger'   => 8225,
00104                 'darr'     => 8595,
00105                 'dArr'     => 8659,
00106                 'deg'      => 176,
00107                 'Delta'    => 916,
00108                 'delta'    => 948,
00109                 'diams'    => 9830,
00110                 'divide'   => 247,
00111                 'Eacute'   => 201,
00112                 'eacute'   => 233,
00113                 'Ecirc'    => 202,
00114                 'ecirc'    => 234,
00115                 'Egrave'   => 200,
00116                 'egrave'   => 232,
00117                 'empty'    => 8709,
00118                 'emsp'     => 8195,
00119                 'ensp'     => 8194,
00120                 'Epsilon'  => 917,
00121                 'epsilon'  => 949,
00122                 'equiv'    => 8801,
00123                 'Eta'      => 919,
00124                 'eta'      => 951,
00125                 'ETH'      => 208,
00126                 'eth'      => 240,
00127                 'Euml'     => 203,
00128                 'euml'     => 235,
00129                 'euro'     => 8364,
00130                 'exist'    => 8707,
00131                 'fnof'     => 402,
00132                 'forall'   => 8704,
00133                 'frac12'   => 189,
00134                 'frac14'   => 188,
00135                 'frac34'   => 190,
00136                 'frasl'    => 8260,
00137                 'Gamma'    => 915,
00138                 'gamma'    => 947,
00139                 'ge'       => 8805,
00140                 'gt'       => 62,
00141                 'harr'     => 8596,
00142                 'hArr'     => 8660,
00143                 'hearts'   => 9829,
00144                 'hellip'   => 8230,
00145                 'Iacute'   => 205,
00146                 'iacute'   => 237,
00147                 'Icirc'    => 206,
00148                 'icirc'    => 238,
00149                 'iexcl'    => 161,
00150                 'Igrave'   => 204,
00151                 'igrave'   => 236,
00152                 'image'    => 8465,
00153                 'infin'    => 8734,
00154                 'int'      => 8747,
00155                 'Iota'     => 921,
00156                 'iota'     => 953,
00157                 'iquest'   => 191,
00158                 'isin'     => 8712,
00159                 'Iuml'     => 207,
00160                 'iuml'     => 239,
00161                 'Kappa'    => 922,
00162                 'kappa'    => 954,
00163                 'Lambda'   => 923,
00164                 'lambda'   => 955,
00165                 'lang'     => 9001,
00166                 'laquo'    => 171,
00167                 'larr'     => 8592,
00168                 'lArr'     => 8656,
00169                 'lceil'    => 8968,
00170                 'ldquo'    => 8220,
00171                 'le'       => 8804,
00172                 'lfloor'   => 8970,
00173                 'lowast'   => 8727,
00174                 'loz'      => 9674,
00175                 'lrm'      => 8206,
00176                 'lsaquo'   => 8249,
00177                 'lsquo'    => 8216,
00178                 'lt'       => 60,
00179                 'macr'     => 175,
00180                 'mdash'    => 8212,
00181                 'micro'    => 181,
00182                 'middot'   => 183,
00183                 'minus'    => 8722,
00184                 'Mu'       => 924,
00185                 'mu'       => 956,
00186                 'nabla'    => 8711,
00187                 'nbsp'     => 160,
00188                 'ndash'    => 8211,
00189                 'ne'       => 8800,
00190                 'ni'       => 8715,
00191                 'not'      => 172,
00192                 'notin'    => 8713,
00193                 'nsub'     => 8836,
00194                 'Ntilde'   => 209,
00195                 'ntilde'   => 241,
00196                 'Nu'       => 925,
00197                 'nu'       => 957,
00198                 'Oacute'   => 211,
00199                 'oacute'   => 243,
00200                 'Ocirc'    => 212,
00201                 'ocirc'    => 244,
00202                 'OElig'    => 338,
00203                 'oelig'    => 339,
00204                 'Ograve'   => 210,
00205                 'ograve'   => 242,
00206                 'oline'    => 8254,
00207                 'Omega'    => 937,
00208                 'omega'    => 969,
00209                 'Omicron'  => 927,
00210                 'omicron'  => 959,
00211                 'oplus'    => 8853,
00212                 'or'       => 8744,
00213                 'ordf'     => 170,
00214                 'ordm'     => 186,
00215                 'Oslash'   => 216,
00216                 'oslash'   => 248,
00217                 'Otilde'   => 213,
00218                 'otilde'   => 245,
00219                 'otimes'   => 8855,
00220                 'Ouml'     => 214,
00221                 'ouml'     => 246,
00222                 'para'     => 182,
00223                 'part'     => 8706,
00224                 'permil'   => 8240,
00225                 'perp'     => 8869,
00226                 'Phi'      => 934,
00227                 'phi'      => 966,
00228                 'Pi'       => 928,
00229                 'pi'       => 960,
00230                 'piv'      => 982,
00231                 'plusmn'   => 177,
00232                 'pound'    => 163,
00233                 'prime'    => 8242,
00234                 'Prime'    => 8243,
00235                 'prod'     => 8719,
00236                 'prop'     => 8733,
00237                 'Psi'      => 936,
00238                 'psi'      => 968,
00239                 'quot'     => 34,
00240                 'radic'    => 8730,
00241                 'rang'     => 9002,
00242                 'raquo'    => 187,
00243                 'rarr'     => 8594,
00244                 'rArr'     => 8658,
00245                 'rceil'    => 8969,
00246                 'rdquo'    => 8221,
00247                 'real'     => 8476,
00248                 'reg'      => 174,
00249                 'rfloor'   => 8971,
00250                 'Rho'      => 929,
00251                 'rho'      => 961,
00252                 'rlm'      => 8207,
00253                 'rsaquo'   => 8250,
00254                 'rsquo'    => 8217,
00255                 'sbquo'    => 8218,
00256                 'Scaron'   => 352,
00257                 'scaron'   => 353,
00258                 'sdot'     => 8901,
00259                 'sect'     => 167,
00260                 'shy'      => 173,
00261                 'Sigma'    => 931,
00262                 'sigma'    => 963,
00263                 'sigmaf'   => 962,
00264                 'sim'      => 8764,
00265                 'spades'   => 9824,
00266                 'sub'      => 8834,
00267                 'sube'     => 8838,
00268                 'sum'      => 8721,
00269                 'sup'      => 8835,
00270                 'sup1'     => 185,
00271                 'sup2'     => 178,
00272                 'sup3'     => 179,
00273                 'supe'     => 8839,
00274                 'szlig'    => 223,
00275                 'Tau'      => 932,
00276                 'tau'      => 964,
00277                 'there4'   => 8756,
00278                 'Theta'    => 920,
00279                 'theta'    => 952,
00280                 'thetasym' => 977,
00281                 'thinsp'   => 8201,
00282                 'THORN'    => 222,
00283                 'thorn'    => 254,
00284                 'tilde'    => 732,
00285                 'times'    => 215,
00286                 'trade'    => 8482,
00287                 'Uacute'   => 218,
00288                 'uacute'   => 250,
00289                 'uarr'     => 8593,
00290                 'uArr'     => 8657,
00291                 'Ucirc'    => 219,
00292                 'ucirc'    => 251,
00293                 'Ugrave'   => 217,
00294                 'ugrave'   => 249,
00295                 'uml'      => 168,
00296                 'upsih'    => 978,
00297                 'Upsilon'  => 933,
00298                 'upsilon'  => 965,
00299                 'Uuml'     => 220,
00300                 'uuml'     => 252,
00301                 'weierp'   => 8472,
00302                 'Xi'       => 926,
00303                 'xi'       => 958,
00304                 'Yacute'   => 221,
00305                 'yacute'   => 253,
00306                 'yen'      => 165,
00307                 'Yuml'     => 376,
00308                 'yuml'     => 255,
00309                 'Zeta'     => 918,
00310                 'zeta'     => 950,
00311                 'zwj'      => 8205,
00312                 'zwnj'     => 8204
00313         );
00314 
/**
 * Alternative spellings of entity names: alias => name of the matching key
 * in $htmlEntities. Both entries (one in Hebrew script, one in Arabic
 * script) stand for "rlm".
 */
00318         static $htmlEntityAliases = array(
00319                 'רלמ' => 'rlm',
00320                 'رلم' => 'rlm',
00321         );
00322 
/**
 * Cache for the pattern built by getAttribsRegex(); it has no value (null)
 * until that method has been called once.
 */
00326         static $attribsRegex;
00327 
/**
 * Return a regular expression matching one attribute: a name, optionally
 * followed by "=" and a value. The pattern is assembled on the first call
 * and cached in self::$attribsRegex for later calls.
 *
 * Capture groups, as read from the pattern below:
 *   1: attribute name
 *   2: the optional "= value" part as a whole
 *   3: value written in double quotes (quotes excluded)
 *   4: value written in single quotes (quotes excluded)
 *   5: unquoted value
 *   6: unquoted "#" followed by hex digits
 * A match must start at the beginning of the subject or after whitespace and
 * must be followed by whitespace or the end of the subject.
 *
 * @return string PCRE pattern including delimiters and the "sx" flags
 */
00333         static function getAttribsRegex() {
00334                 if ( self::$attribsRegex === null ) {
                              // The first character of a name may not be "-" or ".";
                              // later characters may.
00335                         $attribFirst = '[:A-Z_a-z0-9]';
00336                         $attrib = '[:A-Z_a-z-.0-9]';
                              // Tab, line feed, carriage return and space.
00337                         $space = '[\x09\x0a\x0d\x20]';
00338                         self::$attribsRegex =
00339                                 "/(?:^|$space)({$attribFirst}{$attrib}*)
00340                                   ($space*=$space*
00341                                         (?:
00342                                          # The attribute value: quoted or alone
00343                                           \"([^<\"]*)\"
00344                                          | '([^<']*)'
00345                                          |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00346                                          |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00347                                                                                  # colors are specified like this.
00348                                                                                  # We'll be normalizing it.
00349                                         )
00350                                 )?(?=$space|\$)/sx";
00351                 }
00352                 return self::$attribsRegex;
00353         }
00354 
/**
 * Sanitize HTML-like markup in a piece of text.
 *
 * HTML comments are removed first. The text is then split on "<" and each
 * piece is checked against a list of allowed element names. A piece that
 * is not an allowed tag, or that is judged bad, is kept as text with its "<"
 * and ">" escaped. An allowed tag is written back with its attribute text
 * replaced by the result of Sanitizer::fixTagAttributes().
 *
 * When $wgUseTidy is off, a stack of open elements is also kept: table cells
 * outside a table, non-nestable elements nested in themselves, self-closed
 * paired tags and mismatched closing tags are treated as bad, and elements
 * still open at the end are closed. When $wgUseTidy is on, none of that
 * balancing is done here.
 *
 * The allowed-element tables are built once and cached in function statics.
 * They are rebuilt when $wgHtml5, $wgAllowMicrodataAttributes or
 * $wgAllowImageTag change.
 *
 * @param $text String: text to sanitize
 * @param $processCallback Callback: optional; called as ( &$params, $args ),
 *   where $params is the attribute text of a tag and may be changed in place
 * @param $args Array: passed unchanged as second argument to $processCallback
 * @param $extratags Array: additional element names to allow; they are
 *   treated like the elements that must be closed
 * @param $removetags Array: element names to take out of the allowed set
 * @return string the sanitized text
 */
00366         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
00367                 global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
00368 
00369                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00370                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
00371 
00372                 wfProfileIn( __METHOD__ );
00373 
00374                 // Base our staticInitialised variable off of the global config state so that if the globals
00375                 // are changed (like in the screwed up test system) we will re-initialise the settings.
00376                 $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
00377                 if ( !$staticInitialised || $staticInitialised != $globalContext ) {
00378 
00379                         $htmlpairsStatic = array( # Tags that must be closed
00380                                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00381                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00382                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
00383                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00384                                 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
00385                                 'kbd', 'samp'
00386                         );
00387                         if ( $wgHtml5 ) {
00388                                 $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
00389                         }
00390                         $htmlsingle = array(
00391                                 'br', 'hr', 'li', 'dt', 'dd'
00392                         );
00393                         $htmlsingleonly = array( # Elements that cannot have close tags
00394                                 'br', 'hr'
00395                         );
00396                         if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
00397                                 $htmlsingle[] = $htmlsingleonly[] = 'meta';
00398                                 $htmlsingle[] = $htmlsingleonly[] = 'link';
00399                         }
00400                         $htmlnest = array( # Tags that can be nested--??
00401                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00402                                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
00403                         );
00404                         $tabletags = array( # Can only appear inside table, we will close them
00405                                 'td', 'th', 'tr',
00406                         );
00407                         $htmllist = array( # Tags used by list
00408                                 'ul','ol',
00409                         );
00410                         $listtags = array( # Tags that can appear in a list
00411                                 'li',
00412                         );
00413 
00414                         if ( $wgAllowImageTag ) {
00415                                 $htmlsingle[] = 'img';
00416                                 $htmlsingleonly[] = 'img';
00417                         }
00418 
00419                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
00420                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
00421 
00422                         # Convert them all to hashtables for faster lookup
00423                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
00424                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
00425                         foreach ( $vars as $var ) {
00426                                 $$var = array_flip( $$var );
00427                         }
00428                         $staticInitialised = $globalContext;
00429                 }
00430                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
00431                 $extratags = array_flip( $extratags );
00432                 $removetags = array_flip( $removetags );
00433                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
00434                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
00435 
00436                 # Remove HTML comments
00437                 $text = Sanitizer::removeHTMLcomments( $text );
                      // Split on "<": the first piece is the text before any tag, and every
                      // later piece begins right after a "<".
00438                 $bits = explode( '<', $text );
00439                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
00440                 if ( !$wgUseTidy ) {
                              // $tagstack holds the elements currently open; $tablestack holds
                              // the $tagstack that was saved each time a <table> was opened.
00441                         $tagstack = $tablestack = array();
00442                         foreach ( $bits as $x ) {
00443                                 $regs = array();
00444                                 # $slash: Does the current element start with a '/'?
00445                                 # $t: Current element name
00446                                 # $params: String between element name and >
00447                                 # $brace: Ending '>' or '/>'
00448                                 # $rest: Everything until the next element of $bits
00449                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00450                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00451                                 } else {
00452                                         $slash = $t = $params = $brace = $rest = null;
00453                                 }
00454 
00455                                 $badtag = false;
00456                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00457                                         # Check our stack
00458                                         if ( $slash && isset( $htmlsingleonly[$t] ) ) {
00459                                                 $badtag = true;
00460                                         } elseif ( $slash ) {
00461                                                 # Closing a tag... is it the one we just opened?
00462                                                 $ot = @array_pop( $tagstack );
00463                                                 if ( $ot != $t ) {
00464                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
00465                                                                 # Pop all elements with an optional close tag
00466                                                                 # and see if we find a match below them
00467                                                                 $optstack = array();
00468                                                                 array_push( $optstack, $ot );
00469                                                                 wfSuppressWarnings();
00470                                                                 $ot = array_pop( $tagstack );
00471                                                                 wfRestoreWarnings();
00472                                                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
00473                                                                         array_push( $optstack, $ot );
00474                                                                         wfSuppressWarnings();
00475                                                                         $ot = array_pop( $tagstack );
00476                                                                         wfRestoreWarnings();
00477                                                                 }
00478                                                                 if ( $t != $ot ) {
00479                                                                         # No match. Push the optional elements back again
00480                                                                         $badtag = true;
00481                                                                         wfSuppressWarnings();
00482                                                                         $ot = array_pop( $optstack );
00483                                                                         wfRestoreWarnings();
00484                                                                         while ( $ot ) {
00485                                                                                 array_push( $tagstack, $ot );
00486                                                                                 wfSuppressWarnings();
00487                                                                                 $ot = array_pop( $optstack );
00488                                                                                 wfRestoreWarnings();
00489                                                                         }
00490                                                                 }
00491                                                         } else {
00492                                                                 @array_push( $tagstack, $ot );
00493                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
00494                                                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
00495                                                                         $badtag = true;
00496                                                                 }
00497                                                         }
00498                                                 } else {
00499                                                         if ( $t == 'table' ) {
00500                                                                 $tagstack = array_pop( $tablestack );
00501                                                         }
00502                                                 }
00503                                                 $newparams = '';
00504                                         } else {
00505                                                 # Keep track for later
00506                                                 if ( isset( $tabletags[$t] ) &&
00507                                                 !in_array( 'table', $tagstack ) ) {
00508                                                         $badtag = true;
00509                                                 } elseif ( in_array( $t, $tagstack ) &&
00510                                                 !isset( $htmlnest [$t ] ) ) {
00511                                                         $badtag = true;
00512                                                 # Is it a self closed htmlpair ? (bug 5487)
00513                                                 } elseif ( $brace == '/>' &&
00514                                                 isset( $htmlpairs[$t] ) ) {
00515                                                         $badtag = true;
00516                                                 } elseif ( isset( $htmlsingleonly[$t] ) ) {
00517                                                         # Hack to force empty tag for unclosable elements
00518                                                         $brace = '/>';
00519                                                 } elseif ( isset( $htmlsingle[$t] ) ) {
00520                                                         # Hack to not close $htmlsingle tags
00521                                                         $brace = null;
00522                                                         # Still need to push this optionally-closed tag to
00523                                                         # the tag stack so that we can match end tags
00524                                                         # instead of marking them as bad.
00525                                                         array_push( $tagstack, $t );
00526                                                 } elseif ( isset( $tabletags[$t] )
00527                                                 && in_array( $t, $tagstack ) ) {
00528                                                         // New table tag but forgot to close the previous one
00529                                                         $text .= "</$t>";
00530                                                 } else {
00531                                                         if ( $t == 'table' ) {
00532                                                                 array_push( $tablestack, $tagstack );
00533                                                                 $tagstack = array();
00534                                                         }
00535                                                         array_push( $tagstack, $t );
00536                                                 }
00537 
00538                                                 # Replace any variables or template parameters with
00539                                                 # plaintext results.
00540                                                 if( is_callable( $processCallback ) ) {
00541                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
00542                                                 }
00543 
00544                                                 if ( !Sanitizer::validateTag( $params, $t ) ) {
00545                                                         $badtag = true;
00546                                                 }
00547 
00548                                                 # Strip non-approved attributes from the tag
00549                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
00550                                         }
00551                                         if ( !$badtag ) {
00552                                                 $rest = str_replace( '>', '&gt;', $rest );
00553                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00554                                                 $text .= "<$slash$t$newparams$close>$rest";
00555                                                 continue;
00556                                         }
00557                                 }
                                      // Reached for anything that is not an allowed element or that
                                      // was marked bad above: keep it as escaped text.
00558                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
00559                         }
00560                         # Close off any remaining tags
00561                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
00562                                 $text .= "</$t>\n";
00563                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
00564                         }
00565                 } else {
                              // $wgUseTidy is on: no tag stack is kept in this branch. Tags are
                              // only checked against the allowed set and have their attributes
                              // filtered. The callback is applied to closing tags as well here.
00566                         # this might be possible using tidy itself
00567                         foreach ( $bits as $x ) {
00568                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00569                                 $x, $regs );
00570                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00571                                 $badtag = false;
00572                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00573                                         if( is_callable( $processCallback ) ) {
00574                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
00575                                         }
00576 
00577                                         if ( !Sanitizer::validateTag( $params, $t ) ) {
00578                                                 $badtag = true;
00579                                         }
00580 
00581                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
00582                                         if ( !$badtag ) {
00583                                                 $rest = str_replace( '>', '&gt;', $rest );
00584                                                 $text .= "<$slash$t$newparams$brace$rest";
00585                                                 continue;
00586                                         }
00587                                 }
00588                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
00589                         }
00590                 }
00591                 wfProfileOut( __METHOD__ );
00592                 return $text;
00593         }
00594 
/**
 * Strip HTML comments ("<!-- ... -->") from the given text.
 *
 * A comment that has a newline before it and a newline after it, with
 * nothing but spaces in between, is removed together with those spaces and
 * one of the two newlines, so no blank line is left behind. Any other
 * comment is cut out exactly. An opening marker without a closing marker
 * ends processing; it and everything after it stay in the text.
 *
 * @param $text String: text to process
 * @return string the text without its terminated comments
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );

	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			# No comment left
			break;
		}

		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; leave the remainder untouched
			break;
		}
		# Move past the closing marker
		$close += 3;

		# Start from the character just before the comment and widen the
		# range over the spaces on both sides of it
		$from = max( $open - 1, 0 );
		$span = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			--$from;
			++$span;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			++$span;
		}

		$ownLine = substr( $text, $from, 1 ) === "\n"
			&& substr( $text, $from + $span, 1 ) === "\n";

		# Either collapse the whole padded range, including the trailing
		# newline, into a single newline, or cut out only the comment
		$text = $ownLine
			? substr_replace( $text, "\n", $from, $span + 1 )
			: substr_replace( $text, '', $open, $close - $open );
	}

	wfProfileOut( __METHOD__ );
	return $text;
}
00639 
/**
 * Check the attribute requirements that apply to a tag as a whole.
 *
 * Only "meta" and "link" are restricted: both need an itemprop attribute,
 * "meta" also needs content and "link" also needs href. Every other element
 * passes.
 *
 * @param $params String: attribute text of the tag, decoded here with
 *   Sanitizer::decodeTagAttributes()
 * @param $element String: element name, compared against "meta" and "link"
 * @return bool false if a required attribute is not set, true otherwise
 */
static function validateTag( $params, $element ) {
	$attribs = Sanitizer::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !( $isMeta || $isLink ) ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	$required = array( 'itemprop' );
	if ( $isMeta ) {
		// <meta> must have a content="" for the itemprop
		$required[] = 'content';
	}
	if ( $isLink ) {
		// <link> must have an associated href=""
		$required[] = 'href';
	}

	foreach ( $required as $name ) {
		if ( !isset( $attribs[$name] ) ) {
			return false;
		}
	}

	return true;
}
00672 
/**
 * Filter a set of decoded attributes against the whitelist registered
 * for the given element.
 *
 * @param array $attribs Attribute name => value pairs
 * @param string $element Tag name used to look up the whitelist
 * @return array Result of validateAttributes() for that whitelist
 */
static function validateTagAttributes( $attribs, $element ) {
    $allowed = Sanitizer::attributeWhitelist( $element );

    return Sanitizer::validateAttributes( $attribs, $allowed );
}
00692 
/**
 * Filter a set of decoded attributes against a whitelist and run the
 * per-attribute safety checks.
 *
 * - xmlns:* declarations are passed through when RDFa is enabled and the
 *   value does not match EVIL_URI_PATTERN.
 * - Attributes not in the whitelist are dropped (data-* is always allowed
 *   in HTML5 mode).
 * - style values go through checkCss(), id values through escapeId().
 * - role is kept only when its value is "presentation".
 * - RDFa / microdata attributes matching EVIL_URI_PATTERN are dropped.
 * - href and src are kept only when they start with an allowed protocol.
 *
 * @param array $attribs Attribute name => value pairs
 * @param array $whitelist List of allowed attribute names
 * @return array Surviving attribute name => value pairs
 */
static function validateAttributes( $attribs, $whitelist ) {
    global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;

    // RDFa and microdata properties allow URLs, URIs and/or CURIs;
    // their values get an extra sanity check below.
    $uriLikeAttribs = array(
        'rel', 'rev',
        'about', 'property', 'resource', 'datatype', 'typeof',       # RDFa
        'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',    # HTML5 microdata
    );

    $allowedNames = array_flip( $whitelist );
    $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

    $out = array();
    foreach ( $attribs as $attribute => $value ) {
        # Allow XML namespace declarations if RDFa is enabled
        if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
            if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
                $out[$attribute] = $value;
            }
            continue;
        }

        # Allow any attribute beginning with "data-", if in HTML5 mode
        $isDataAttrib = $wgHtml5 && preg_match( '/^data-/i', $attribute );
        if ( !$isDataAttrib && !isset( $allowedNames[$attribute] ) ) {
            continue;
        }

        # Strip javascript "expression" from stylesheets.
        # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        if ( $attribute == 'style' ) {
            $value = Sanitizer::checkCss( $value );
        }

        if ( $attribute === 'id' ) {
            $value = Sanitizer::escapeId( $value, 'noninitial' );
        }

        # WAI-ARIA
        # http://www.w3.org/TR/wai-aria/
        # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
        # For now only role="presentation" is supported, until it is worked out
        # which roles should be usable by content and the code explicitly rejects
        # patterns that violate HTML5's ARIA restrictions.
        if ( $attribute === 'role' && $value !== 'presentation' ) {
            continue;
        }

        // Paranoia. Allow "simple" values but suppress javascript
        if ( in_array( $attribute, $uriLikeAttribs, true )
            && preg_match( self::EVIL_URI_PATTERN, $value )
        ) {
            continue;
        }

        # NOTE: even though elements using href/src are not allowed directly, supply
        #       validation code that can be used by tag hook handlers, etc
        $isLinkAttrib = ( $attribute === 'href' || $attribute === 'src' );
        if ( $isLinkAttrib && !preg_match( $hrefExp, $value ) ) {
            // Drop any href or src attribute not using an allowed protocol.
            // NOTE: this also drops all relative URLs
            continue;
        }

        // If this attribute was previously set, override it.
        // Output should only have one attribute of each name.
        $out[$attribute] = $value;
    }

    # itemtype, itemid, itemref don't make sense without itemscope
    if ( $wgAllowMicrodataAttributes && !array_key_exists( 'itemscope', $out ) ) {
        unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
    }
    # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

    return $out;
}
00788 
/**
 * Merge two attribute arrays; entries of $b override entries of $a.
 *
 * The 'class' attribute is special-cased: when both arrays carry a string
 * class and the two strings differ, the class names of both are combined,
 * de-duplicated and joined with single spaces.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
    $merged = array_merge( $a, $b );

    if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
        return $merged;
    }
    if ( !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
        return $merged;
    }
    if ( $a['class'] === $b['class'] ) {
        // Identical class strings: array_merge() already did the right thing.
        return $merged;
    }

    $names = preg_split(
        '/\s+/',
        "{$a['class']} {$b['class']}",
        -1,
        PREG_SPLIT_NO_EMPTY
    );
    $merged['class'] = implode( ' ', array_unique( $names ) );

    return $merged;
}
00810 
/**
 * Normalize a CSS attribute value and reject it if it still contains
 * constructs considered unsafe.
 *
 * The value is decoded (character references, CSS escapes, fullwidth and
 * other look-alike characters), comments are blanked out, and the result
 * is checked for control characters and a list of problematic keywords.
 *
 * @param string $value CSS text, e.g. the content of a style attribute
 * @return string The normalized CSS, or the literal placeholder
 *   '/* invalid control char *' . '/' / '/* insecure input *' . '/'
 *   (written here split only to keep this docblock open) when rejected
 */
static function checkCss( $value ) {
    // Decode character references like &#123;
    $value = Sanitizer::decodeCharReferences( $value );

    // Decode escape sequences and line continuation
    // See the grammar in the CSS 2 spec, appendix D.
    // This has to be done AFTER decoding character references.
    // This means it isn't possible for this function to return
    // unsanitized escape sequences. It is possible to manufacture
    // input that contains character references that decode to
    // escape sequences that decode to character references, but
    // it's OK for the return value to contain character references
    // because the caller is supposed to escape those anyway.
    static $decodeRegex;
    if ( !$decodeRegex ) {
        $space = '[\\x20\\t\\r\\n\\f]';
        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
        $backslash = '\\\\';
        $decodeRegex = "/ $backslash
            (?:
                ($nl) |  # 1. Line continuation
                ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
                (.) | # 3. backslash cancelling special meaning
                () | # 4. backslash at end of string
            )/xu";
    }
    $value = preg_replace_callback( $decodeRegex,
        array( __CLASS__, 'cssDecodeCallback' ), $value );

    // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
    // Range is U+FF01 to U+FF5A, excluding U+FF3C (bug 58088). The code points
    // are spelled as \x{...} escapes so the pattern does not depend on how the
    // bytes of this source file are encoded or normalized.
    $value = preg_replace_callback(
        '/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u',
        function ( $matches ) {
            $cp = utf8ToCodepoint( $matches[0] );
            if ( $cp === false ) {
                return '';
            }
            return chr( $cp - 65248 ); // ASCII range \x21-\x7A
        },
        $value
    );

    // Convert more characters IE6 might treat as ascii
    // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
    $value = str_replace(
        array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
        array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
        $value
    );

    // Remove any comments; IE gets token splitting wrong
    // This must be done AFTER decoding character references and
    // escape sequences, because those steps can introduce comments
    // This step cannot introduce character references or escape
    // sequences, because it replaces comments with spaces rather
    // than removing them completely.
    $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

    // Remove anything after a comment-start token, to guard against
    // incorrect client implementations.
    $commentPos = strpos( $value, '/*' );
    if ( $commentPos !== false ) {
        $value = substr( $value, 0, $commentPos );
    }

    // S followed by repeat, iteration, or prolonged sound marks,
    // which IE will treat as "ss"
    $value = preg_replace(
        '/s(?:
            \xE3\x80\xB1 | # U+3031
            \xE3\x82\x9D | # U+309D
            \xE3\x83\xBC | # U+30FC
            \xE3\x83\xBD | # U+30FD
            \xEF\xB9\xBC | # U+FE7C
            \xEF\xB9\xBD | # U+FE7D
            \xEF\xBD\xB0   # U+FF70
        )/ix',
        'ss',
        $value
    );

    // Reject problematic keywords and control characters
    if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
        return '/* invalid control char */';
    } elseif ( preg_match(
        '! expression
            | filter\s*:
            | accelerator\s*:
            | -o-link\s*:
            | -o-link-source\s*:
            | -o-replace\s*:
            | url\s*\(
            | image\s*\(
            | image-set\s*\(
        !ix', $value ) ) {
        return '/* insecure input */';
    }
    return $value;
}
00927 
/**
 * preg_replace_callback() handler used by checkCss() to decode one CSS
 * backslash escape.
 *
 * Capture groups: 1 = line continuation, 2 = hexadecimal character number,
 * 3 = any other escaped character; none of them set = lone backslash.
 *
 * @param array $matches
 * @return string Empty string for a line continuation; a normalized
 *   "\hex " escape for newline, quotes and backslash; otherwise the
 *   decoded character itself
 */
static function cssDecodeCallback( $matches ) {
    if ( $matches[1] !== '' ) {
        // Line continuation
        return '';
    }

    if ( $matches[2] !== '' ) {
        $decoded = codepointToUtf8( hexdec( $matches[2] ) );
    } elseif ( $matches[3] !== '' ) {
        $decoded = $matches[3];
    } else {
        $decoded = '\\';
    }

    switch ( $decoded ) {
        case "\n":
        case '"':
        case "'":
        case '\\':
            // These characters need to be escaped in strings.
            // Emit a clean escape sequence to avoid parsing errors by clients.
            return '\\' . dechex( ord( $decoded ) ) . ' ';
        default:
            // The escape was unnecessary; return the plain character.
            return $decoded;
    }
}
00952 
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string.
 *
 * The text is decoded, filtered through the element's whitelist, and each
 * surviving pair is re-encoded as name="value".
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name
 * @return string Empty string, or the attributes each preceded by a space
 */
static function fixTagAttributes( $text, $element ) {
    if ( trim( $text ) == '' ) {
        return '';
    }

    $validated = Sanitizer::validateTagAttributes(
        Sanitizer::decodeTagAttributes( $text ),
        $element
    );

    $html = '';
    foreach ( $validated as $name => $value ) {
        $encName = htmlspecialchars( $name );
        $encValue = Sanitizer::safeEncodeAttribute( $value );
        $html .= " $encName=\"$encValue\"";
    }

    return $html;
}
00989 
/**
 * Encode a string for use as an HTML attribute value.
 *
 * @param string $text
 * @return string HTML-escaped text (quotes included) with newline,
 *   carriage return and tab written as numeric character references
 */
static function encodeAttribute( $text ) {
    $escaped = htmlspecialchars( $text, ENT_QUOTES );

    // Whitespace is normalized during attribute decoding, so any
    // non-space whitespace must be encoded up front or it will not
    // be preserved.
    return str_replace(
        array( "\n", "\r", "\t" ),
        array( '&#10;', '&#13;', '&#9;' ),
        $escaped
    );
}
01009 
/**
 * Encode an attribute value like encodeAttribute(), and additionally
 * armor sequences that could be expanded by later parsing stages.
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $armor = array(
        '<'    => '&lt;',   // This should never happen,
        '>'    => '&gt;',   // we've received invalid input
        '"'    => '&quot;', // which should have been escaped.
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
    );
    $armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

    # Stupid hack: break up anything that looks like a URL protocol
    $protocolRegex = '/((?i)' . wfUrlProtocols() . ')/';

    return preg_replace_callback(
        $protocolRegex,
        array( 'Sanitizer', 'armorLinksCallback' ),
        $armored
    );
}
01042 
/**
 * Escape a string so it can be used as an element id.
 *
 * With $wgHtml5 and $wgExperimentalHtmlIds both set (and no 'legacy'
 * option), runs of whitespace, underscores, quotes, '&', '#' and '%' are
 * collapsed to a single underscore and outer underscores are trimmed.
 * Otherwise the id is URL-encoded HTML4-style, with '%3A' mapped back to
 * ':' and every remaining '%' turned into '.'.
 *
 * @param string $id
 * @param string|array $options 'legacy' forces HTML4-style escaping;
 *   'noninitial' suppresses the "x" prefix otherwise added when the
 *   HTML4-style result does not start with an ASCII letter
 * @return string
 */
static function escapeId( $id, $options = array() ) {
    global $wgHtml5, $wgExperimentalHtmlIds;
    $options = (array)$options;

    $useHtml5Ids = $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options );
    if ( $useHtml5Ids ) {
        $decoded = Sanitizer::decodeCharReferences( $id );
        $collapsed = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $decoded );
        $trimmed = trim( $collapsed, '_' );

        # An empty result means the input was all whitespace to start with.
        return $trimmed === '' ? '_' : $trimmed;
    }

    # HTML4-style escaping
    $underscored = str_replace( ' ', '_', $id );
    $encoded = urlencode( Sanitizer::decodeCharReferences( $underscored ) );
    $encoded = str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );

    $startsWithLetter = preg_match( '/^[a-zA-Z]/', $encoded );
    if ( !$startsWithLetter && !in_array( 'noninitial', $options ) ) {
        // Initial character must be a letter!
        $encoded = "x$encoded";
    }

    return $encoded;
}
01106 
/**
 * Escape a string so it can be used as a CSS class name.
 *
 * A leading digit or hyphen, ASCII control characters and spaces, most
 * ASCII punctuation, and the byte pair C2 A0 are each turned into an
 * underscore; runs of underscores are then collapsed and trailing
 * underscores removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
    // Convert ugly stuff to underscores...
    $escaped = preg_replace(
        '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
        '_',
        $class
    );

    // ...and kill underscores in ugly places.
    $escaped = preg_replace( '/_+/', '_', $escaped );

    return rtrim( $escaped, '_' );
}
01125 
/**
 * Escape text for HTML output while letting character references in the
 * input stand for the characters they name: references are decoded first
 * and the result is then escaped as a whole.
 *
 * @param string $html
 * @return string
 */
static function escapeHtmlAllowEntities( $html ) {
    $decoded = Sanitizer::decodeCharReferences( $html );

    # Escaping ' as well as " as a matter of course; it can't hurt.
    return htmlspecialchars( $decoded, ENT_QUOTES );
}
01140 
/**
 * preg_replace_callback() handler: rewrite every colon in the first
 * capture group as the character reference &#58;.
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
    return strtr( $matches[1], array( ':' => '&#58;' ) );
}
01149 
/**
 * Parse the raw attribute text of a tag into an array.
 *
 * Attribute names are lower-cased; values have whitespace runs collapsed
 * to one space, are trimmed, and have character references decoded.
 * A later attribute with the same name overwrites an earlier one.
 *
 * @param string $text
 * @return array Attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
    $attribs = array();

    if ( trim( $text ) == '' ) {
        return $attribs;
    }

    $sets = array();
    $found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
    if ( !$found ) {
        return $attribs;
    }

    foreach ( $sets as $set ) {
        $name = strtolower( $set[1] );
        $raw = Sanitizer::getTagAttributeCallback( $set );

        // Normalize whitespace
        $normalized = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

        // Decode character references
        $attribs[$name] = Sanitizer::decodeCharReferences( $normalized );
    }

    return $attribs;
}
01186 
/**
 * Pick the attribute value out of one match set of the attribute regex.
 *
 * @param array $set One PREG_SET_ORDER match
 * @return string The value, or the attribute name itself when the
 *   attribute was written without a value
 * @throws MWException If group 2 is set but none of the value groups is
 */
private static function getTagAttributeCallback( $set ) {
    // Value groups, checked from the highest index down:
    //   6 = illegal #XXXXXX color with no quotes
    //   5 = no quotes
    //   4 = single-quoted
    //   3 = double-quoted
    foreach ( array( 6, 5, 4, 3 ) as $group ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( isset( $set[2] ) ) {
        throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
    }

    # In XHTML, attributes must have a value.
    # For 'reduced' form, return explicitly the attribute name here.
    return $set[1];
}
01216 
/**
 * Normalize an attribute value: character references first, then
 * whitespace, and finally every double quote becomes &quot;.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue( $text ) {
    $withRefs = Sanitizer::normalizeCharReferences( $text );
    $withSpaces = self::normalizeWhitespace( $withRefs );

    return str_replace( '"', '&quot;', $withSpaces );
}
01233 
/**
 * Replace each CRLF pair, and each single space, CR, LF or tab, with
 * one space character.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

    return preg_replace( $whitespace, ' ', $text );
}
01244 
/**
 * Collapse every run of spaces and underscores in a section name to a
 * single space, then trim the result.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
    $collapsed = preg_replace( '/[ _]+/', ' ', $section );

    return trim( $collapsed );
}
01256 
/**
 * Run normalizeCharReferencesCallback() over everything in the text that
 * CHAR_REFS_REGEX matches (named, decimal and hexadecimal references,
 * and bare ampersands).
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
    $handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

    return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Capture groups: 1 = entity name, 2 = decimal code point,
 * 3 = hexadecimal code point.
 *
 * @param array $matches
 * @return string The normalized reference, or the HTML-escaped original
 *   match when no group matched or the reference was rejected
 */
static function normalizeCharReferencesCallback( $matches ) {
    $normalized = null;

    if ( $matches[1] != '' ) {
        $normalized = Sanitizer::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        $normalized = Sanitizer::decCharReference( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $normalized = Sanitizer::hexCharReference( $matches[3] );
    }

    return $normalized === null
        ? htmlspecialchars( $matches[0] )
        : $normalized;
}
01297 
/**
 * Normalize a named entity reference.
 *
 * - An alias listed in $htmlEntityAliases becomes its target named entity.
 * - lt, gt, amp and quot are kept as named entities.
 * - Any other name listed in $htmlEntities becomes a decimal reference.
 * - Unknown names get their ampersand escaped.
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function normalizeEntity( $name ) {
    if ( isset( self::$htmlEntityAliases[$name] ) ) {
        return '&' . self::$htmlEntityAliases[$name] . ';';
    }

    $keptAsNamed = array( 'lt', 'gt', 'amp', 'quot' );
    if ( in_array( $name, $keptAsNamed ) ) {
        return "&$name;";
    }

    if ( isset( self::$htmlEntities[$name] ) ) {
        return '&#' . self::$htmlEntities[$name] . ';';
    }

    return "&amp;$name;";
}
01320 
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null "&#N;" when validateCodepoint() accepts the code
 *   point, null otherwise
 */
static function decCharReference( $codepoint ) {
    $point = intval( $codepoint );

    return Sanitizer::validateCodepoint( $point )
        ? sprintf( '&#%d;', $point )
        : null;
}
01333 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null "&#xN;" (lower-case hex) when validateCodepoint()
 *   accepts the code point, null otherwise
 */
static function hexCharReference( $codepoint ) {
    $point = hexdec( $codepoint );

    return Sanitizer::validateCodepoint( $point )
        ? sprintf( '&#x%x;', $point )
        : null;
}
01346 
/**
 * Tell whether a code point is in one of the accepted ranges:
 * tab, line feed, carriage return, U+0020..U+D7FF, U+E000..U+FFFD,
 * or U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    // The three permitted control characters
    if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
        return true;
    }

    if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
        return true;
    }

    if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
        return true;
    }

    return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
01360 
/**
 * Run decodeCharReferencesCallback() over everything in the text that
 * CHAR_REFS_REGEX matches.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    $handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );

    return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
01374 
/**
 * Decode character references like decodeCharReferences() does, and pass
 * the result through $wgContLang->normalize() if at least one replacement
 * was made.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
    global $wgContLang;

    $replacements = 0;
    $decoded = preg_replace_callback(
        self::CHAR_REFS_REGEX,
        array( 'Sanitizer', 'decodeCharReferencesCallback' ),
        $text,
        /* limit */ -1,
        $replacements
    );

    // Nothing matched: skip the normalization call.
    return $replacements ? $wgContLang->normalize( $decoded ) : $decoded;
}
01398 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * Capture groups: 1 = entity name, 2 = decimal code point,
 * 3 = hexadecimal code point.
 *
 * @param array $matches
 * @return string Decoded text, or the match unchanged when none of the
 *   three groups matched
 */
static function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return Sanitizer::decodeEntity( $matches[1] );
    }

    if ( $matches[2] != '' ) {
        $codepoint = intval( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $codepoint = hexdec( $matches[3] );
    } else {
        # Last case should be an ampersand by itself
        return $matches[0];
    }

    return Sanitizer::decodeChar( $codepoint );
}
01414 
/**
 * Convert a code point to its UTF-8 encoding.
 *
 * @param int $codepoint
 * @return string The encoded character, or UTF8_REPLACEMENT when
 *   validateCodepoint() rejects the code point
 */
static function decodeChar( $codepoint ) {
    if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
        return UTF8_REPLACEMENT;
    }

    return codepointToUtf8( $codepoint );
}
01429 
/**
 * Decode a named entity to its UTF-8 character.
 *
 * Aliases from $htmlEntityAliases are resolved first. A name that is not
 * in $htmlEntities is returned re-wrapped as "&name;" (using the resolved
 * name when an alias applied).
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
    $resolved = isset( self::$htmlEntityAliases[$name] )
        ? self::$htmlEntityAliases[$name]
        : $name;

    if ( !isset( self::$htmlEntities[$resolved] ) ) {
        return "&$resolved;";
    }

    return codepointToUtf8( self::$htmlEntities[$resolved] );
}
01448 
/**
 * Look up the attribute whitelist for one element.
 *
 * @param string $element Tag name
 * @return array The element's entry from setupAttributeWhitelist(), or
 *   an empty array when the element has no entry
 */
static function attributeWhitelist( $element ) {
    $byElement = Sanitizer::setupAttributeWhitelist();

    if ( isset( $byElement[$element] ) ) {
        return $byElement[$element];
    }

    return array();
}
01461 
01467         static function setupAttributeWhitelist() {
01468                 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
01469 
01470                 static $whitelist, $staticInitialised;
01471                 $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
01472 
01473                 if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
01474                         return $whitelist;
01475                 }
01476 
01477                 $common = array(
01478                         # HTML
01479                         'id',
01480                         'class',
01481                         'style',
01482                         'lang',
01483                         'dir',
01484                         'title',
01485 
01486                         # WAI-ARIA
01487                         'role',
01488                 );
01489 
01490                 if ( $wgAllowRdfaAttributes ) {
01491                         #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
01492                         $common = array_merge( $common, array(
01493                             'about', 'property', 'resource', 'datatype', 'typeof',
01494                         ) );
01495                 }
01496 
01497                 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
01498                         # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
01499                         $common = array_merge( $common, array(
01500                             'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
01501                         ) );
01502                 }
01503 
01504                 $block = array_merge( $common, array( 'align' ) );
01505                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
01506                 $tablecell = array( 'abbr',
01507                                     'axis',
01508                                     'headers',
01509                                     'scope',
01510                                     'rowspan',
01511                                     'colspan',
01512                                     'nowrap', # deprecated
01513                                     'width',  # deprecated
01514                                     'height', # deprecated
01515                                     'bgcolor' # deprecated
01516                                     );
01517 
01518                 # Numbers refer to sections in HTML 4.01 standard describing the element.
01519                 # See: http://www.w3.org/TR/html4/
01520                 $whitelist = array(
01521                         # 7.5.4
01522                         'div'        => $block,
01523                         'center'     => $common, # deprecated
01524                         'span'       => $block, # ??
01525 
01526                         # 7.5.5
01527                         'h1'         => $block,
01528                         'h2'         => $block,
01529                         'h3'         => $block,
01530                         'h4'         => $block,
01531                         'h5'         => $block,
01532                         'h6'         => $block,
01533 
01534                         # 7.5.6
01535                         # address
01536 
01537                         # 8.2.4
01538                         # bdo
01539 
01540                         # 9.2.1
01541                         'em'         => $common,
01542                         'strong'     => $common,
01543                         'cite'       => $common,
01544                         'dfn'        => $common,
01545                         'code'       => $common,
01546                         'samp'       => $common,
01547                         'kbd'        => $common,
01548                         'var'        => $common,
01549                         'abbr'       => $common,
01550                         # acronym
01551 
01552                         # 9.2.2
01553                         'blockquote' => array_merge( $common, array( 'cite' ) ),
01554                         # q
01555 
01556                         # 9.2.3
01557                         'sub'        => $common,
01558                         'sup'        => $common,
01559 
01560                         # 9.3.1
01561                         'p'          => $block,
01562 
01563                         # 9.3.2
01564                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
01565 
01566                         # 9.3.4
01567                         'pre'        => array_merge( $common, array( 'width' ) ),
01568 
01569                         # 9.4
01570                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01571                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01572 
01573                         # 10.2
01574                         'ul'         => array_merge( $common, array( 'type' ) ),
01575                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
01576                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
01577 
01578                         # 10.3
01579                         'dl'         => $common,
01580                         'dd'         => $common,
01581                         'dt'         => $common,
01582 
01583                         # 11.2.1
01584                         'table'      => array_merge( $common,
01585                                                                 array( 'summary', 'width', 'border', 'frame',
01586                                                                                 'rules', 'cellspacing', 'cellpadding',
01587                                                                                 'align', 'bgcolor',
01588                                                                 ) ),
01589 
01590                         # 11.2.2
01591                         'caption'    => array_merge( $common, array( 'align' ) ),
01592 
01593                         # 11.2.3
01594                         'thead'      => array_merge( $common, $tablealign ),
01595                         'tfoot'      => array_merge( $common, $tablealign ),
01596                         'tbody'      => array_merge( $common, $tablealign ),
01597 
01598                         # 11.2.4
01599                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01600                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01601 
01602                         # 11.2.5
01603                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
01604 
01605                         # 11.2.6
01606                         'td'         => array_merge( $common, $tablecell, $tablealign ),
01607                         'th'         => array_merge( $common, $tablecell, $tablealign ),
01608 
01609                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
01610                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
01611 
01612                         # 13.2
01613                         # Not usually allowed, but may be used for extension-style hooks
01614                         # such as <math> when it is rasterized, or if $wgAllowImageTag is
01615                         # true
01616                         'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
01617 
01618                         # 15.2.1
01619                         'tt'         => $common,
01620                         'b'          => $common,
01621                         'i'          => $common,
01622                         'big'        => $common,
01623                         'small'      => $common,
01624                         'strike'     => $common,
01625                         's'          => $common,
01626                         'u'          => $common,
01627 
01628                         # 15.2.2
01629                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
01630                         # basefont
01631 
01632                         # 15.3
01633                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
01634 
01635                         # XHTML Ruby annotation text module, simple ruby only.
01636                         # http://www.w3c.org/TR/ruby/
01637                         'ruby'       => $common,
01638                         # rbc
01639                         # rtc
01640                         'rb'         => $common,
01641                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
01642                         'rp'         => $common,
01643 
01644                         # MathML root element, where used for extensions
01645                         # 'title' may not be 100% valid here; it's XHTML
01646                         # http://www.w3.org/TR/REC-MathML/
01647                         'math'       => array( 'class', 'style', 'id', 'title' ),
01648 
01649                         # HTML 5 section 4.6
01650                         'bdi' => $common,
01651 
01652                 );
01653 
01654                 if ( $wgHtml5 ) {
01655                         # HTML5 elements, defined by:
01656                         # http://www.whatwg.org/specs/web-apps/current-work/multipage/
01657                         $whitelist += array(
01658                                 'data' => array_merge( $common, array( 'value' ) ),
01659                                 'time' => array_merge( $common, array( 'datetime' ) ),
01660                                 'mark' => $common,
01661 
01662                                 // meta and link are only permitted by removeHTMLtags when Microdata
01663                                 // is enabled so we don't bother adding a conditional to hide these
01664                                 // Also meta and link are only valid in WikiText as Microdata elements
01665                                 // (ie: validateTag rejects tags missing the attributes needed for Microdata)
01666                                 // So we don't bother including $common attributes that have no purpose.
01667                                 'meta' => array( 'itemprop', 'content' ),
01668                                 'link' => array( 'itemprop', 'href' ),
01669                         );
01670                 }
01671 
01672                 $staticInitialised = $globalContext;
01673 
01674                 return $whitelist;
01675         }
01676 
/**
 * Reduce a text fragment to tag-free, whitespace-normalized text.
 *
 * Everything delimited by '<' and '>' is removed first; the remainder is
 * passed through self::decodeCharReferences() and then through
 * self::normalizeWhitespace().
 *
 * Because character references are decoded after the tags are removed,
 * the result may contain literal '<', '>' or '&' characters again; the
 * caller has to escape it before putting it into HTML output.
 *
 * @param string $text Text that may contain tags and character references
 * @return string
 */
static function stripAllTags( $text ) {
    # Drop anything that looks like an actual <tag>
    $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

    # Resolve &entities; first, then tidy up the whitespace
    return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
01697 
/**
 * Build a DOCTYPE declaration whose internal subset declares every name
 * in self::$htmlEntities as an entity expanding to the matching numeric
 * character reference.
 *
 * The declarations are emitted back to back, without any separator,
 * between the "<!DOCTYPE html [" line and the closing "]>" line.
 *
 * @return string
 */
static function hackDocType() {
    $declarations = array();
    foreach ( self::$htmlEntities as $name => $number ) {
        $declarations[] = '<!ENTITY ' . $name . ' "&#' . $number . ';">';
    }

    return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01715 
/**
 * Normalize a URL before it is turned into an external link.
 *
 * Steps, in order:
 *  - character references in the input are decoded;
 *  - square brackets, angle brackets, double quotes, the pipe, DEL and
 *    every byte from 0x00 to 0x20 are replaced via cleanUrlCallback();
 *  - if the result has the shape "scheme:" + optional "//host" + rest,
 *    characters that are ignored in internationalized domain names are
 *    removed from the "//host" part.
 *
 * A string that does not have that shape is returned after the first
 * two steps only.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
    # Resolve any HTML entities in the input; makeExternalLink() is
    # expected to escape the result again.
    $decoded = Sanitizer::decodeCharReferences( $url );

    # Escape control characters and markup-significant characters,
    # including any that the decoding step has just introduced.
    $escaped = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
        array( __CLASS__, 'cleanUrlCallback' ), $decoded );

    # Split into scheme, optional //host and remainder
    $parts = array();
    if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $escaped, $parts ) ) {
        return $escaped;
    }
    $scheme = $parts[1];
    $authority = $parts[2];
    $remainder = $parts[3];

    // Characters that are ignored in IDNs (see RFC 3454, section 3.1).
    // They are removed up front so that blacklists and similar checks
    // see the host name the way it will actually be resolved.
    $ignorable = "/
        \\s|          # general whitespace
        \xc2\xad|     # 00ad SOFT HYPHEN
        \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
        \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
        \xe2\x81\xa0| # 2060 WORD JOINER
        \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
        \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
        \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
        \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
        \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
        \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
        \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
        [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
        /xuD";

    // @todo FIXME: Validate hostnames here

    return $scheme . preg_replace( $ignorable, '', $authority ) . $remainder;
}
01762 
/**
 * preg_replace_callback() helper for cleanUrl(): returns the matched
 * text in urlencode()d form.
 *
 * @param array $matches Match array; only element 0 (the full match) is read
 * @return string
 */
static function cleanUrlCallback( $matches ) {
    $matched = $matches[0];

    return urlencode( $matched );
}
01770 
/**
 * Check whether a string is a syntactically acceptable e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first and receives the address plus
 * a reference to the result variable. If the hook run returns a false
 * value, whatever the handlers stored in that variable is returned
 * as-is (null when nothing was stored).
 *
 * Otherwise the address has to match, case-insensitively and over its
 * whole length:
 *  - a local part of one or more characters taken from letters, digits,
 *    the punctuation listed in $rfc5322_atext and the dot;
 *  - a single "@";
 *  - one or more dot-separated labels made of letters, digits and
 *    hyphens. A dot in the domain is not required, so "user@localhost"
 *    is accepted.
 *
 * The pattern uses the D modifier so that "$" only matches at the very
 * end of the subject. Without it, an address followed by a trailing
 * newline would be reported as valid.
 *
 * @param string $addr E-mail address to check
 * @return bool|null Boolean outcome of the syntax check, or the value
 *   supplied through the 'isValidEmailAddr' hook
 */
public static function validateEmail( $addr ) {
    $result = null;
    if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
        return $result;
    }

    // Both strings below end up inside a bracketed character class [],
    // where a bare hyphen "-" acts as a range indicator. The hyphen is
    // therefore written with a doubled backslash so that the regex
    // engine receives it escaped.
    // See bug 26948
    $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
    $rfc1034_ldh_str = "a-z0-9\\-";

    $HTML5_email_regexp = "/
    ^                      # start of string
    [$rfc5322_atext\\.]+    # user part which is liberal :p
    @                      # at sign
    [$rfc1034_ldh_str]+       # First domain part
    (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
    $                      # End of string
    /ixD"; // case Insensitive, eXtended, Dollar matches at the very end only

    return (bool) preg_match( $HTML5_email_regexp, $addr );
}
01822 }