MediaWiki  REL1_19
Sanitizer.php
Go to the documentation of this file.
00001 <?php
00031 class Sanitizer {
00036         const CHAR_REFS_REGEX =
00037                 '/&([A-Za-z0-9\x80-\xff]+);
00038                  |&\#([0-9]+);
00039                  |&\#[xX]([0-9A-Fa-f]+);
00040                  |(&)/x';
00041 
00050         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
00051         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052 
00059         static $htmlEntities = array(
00060                 'Aacute'   => 193,
00061                 'aacute'   => 225,
00062                 'Acirc'    => 194,
00063                 'acirc'    => 226,
00064                 'acute'    => 180,
00065                 'AElig'    => 198,
00066                 'aelig'    => 230,
00067                 'Agrave'   => 192,
00068                 'agrave'   => 224,
00069                 'alefsym'  => 8501,
00070                 'Alpha'    => 913,
00071                 'alpha'    => 945,
00072                 'amp'      => 38,
00073                 'and'      => 8743,
00074                 'ang'      => 8736,
00075                 'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00076                 'Aring'    => 197,
00077                 'aring'    => 229,
00078                 'asymp'    => 8776,
00079                 'Atilde'   => 195,
00080                 'atilde'   => 227,
00081                 'Auml'     => 196,
00082                 'auml'     => 228,
00083                 'bdquo'    => 8222,
00084                 'Beta'     => 914,
00085                 'beta'     => 946,
00086                 'brvbar'   => 166,
00087                 'bull'     => 8226,
00088                 'cap'      => 8745,
00089                 'Ccedil'   => 199,
00090                 'ccedil'   => 231,
00091                 'cedil'    => 184,
00092                 'cent'     => 162,
00093                 'Chi'      => 935,
00094                 'chi'      => 967,
00095                 'circ'     => 710,
00096                 'clubs'    => 9827,
00097                 'cong'     => 8773,
00098                 'copy'     => 169,
00099                 'crarr'    => 8629,
00100                 'cup'      => 8746,
00101                 'curren'   => 164,
00102                 'dagger'   => 8224,
00103                 'Dagger'   => 8225,
00104                 'darr'     => 8595,
00105                 'dArr'     => 8659,
00106                 'deg'      => 176,
00107                 'Delta'    => 916,
00108                 'delta'    => 948,
00109                 'diams'    => 9830,
00110                 'divide'   => 247,
00111                 'Eacute'   => 201,
00112                 'eacute'   => 233,
00113                 'Ecirc'    => 202,
00114                 'ecirc'    => 234,
00115                 'Egrave'   => 200,
00116                 'egrave'   => 232,
00117                 'empty'    => 8709,
00118                 'emsp'     => 8195,
00119                 'ensp'     => 8194,
00120                 'Epsilon'  => 917,
00121                 'epsilon'  => 949,
00122                 'equiv'    => 8801,
00123                 'Eta'      => 919,
00124                 'eta'      => 951,
00125                 'ETH'      => 208,
00126                 'eth'      => 240,
00127                 'Euml'     => 203,
00128                 'euml'     => 235,
00129                 'euro'     => 8364,
00130                 'exist'    => 8707,
00131                 'fnof'     => 402,
00132                 'forall'   => 8704,
00133                 'frac12'   => 189,
00134                 'frac14'   => 188,
00135                 'frac34'   => 190,
00136                 'frasl'    => 8260,
00137                 'Gamma'    => 915,
00138                 'gamma'    => 947,
00139                 'ge'       => 8805,
00140                 'gt'       => 62,
00141                 'harr'     => 8596,
00142                 'hArr'     => 8660,
00143                 'hearts'   => 9829,
00144                 'hellip'   => 8230,
00145                 'Iacute'   => 205,
00146                 'iacute'   => 237,
00147                 'Icirc'    => 206,
00148                 'icirc'    => 238,
00149                 'iexcl'    => 161,
00150                 'Igrave'   => 204,
00151                 'igrave'   => 236,
00152                 'image'    => 8465,
00153                 'infin'    => 8734,
00154                 'int'      => 8747,
00155                 'Iota'     => 921,
00156                 'iota'     => 953,
00157                 'iquest'   => 191,
00158                 'isin'     => 8712,
00159                 'Iuml'     => 207,
00160                 'iuml'     => 239,
00161                 'Kappa'    => 922,
00162                 'kappa'    => 954,
00163                 'Lambda'   => 923,
00164                 'lambda'   => 955,
00165                 'lang'     => 9001,
00166                 'laquo'    => 171,
00167                 'larr'     => 8592,
00168                 'lArr'     => 8656,
00169                 'lceil'    => 8968,
00170                 'ldquo'    => 8220,
00171                 'le'       => 8804,
00172                 'lfloor'   => 8970,
00173                 'lowast'   => 8727,
00174                 'loz'      => 9674,
00175                 'lrm'      => 8206,
00176                 'lsaquo'   => 8249,
00177                 'lsquo'    => 8216,
00178                 'lt'       => 60,
00179                 'macr'     => 175,
00180                 'mdash'    => 8212,
00181                 'micro'    => 181,
00182                 'middot'   => 183,
00183                 'minus'    => 8722,
00184                 'Mu'       => 924,
00185                 'mu'       => 956,
00186                 'nabla'    => 8711,
00187                 'nbsp'     => 160,
00188                 'ndash'    => 8211,
00189                 'ne'       => 8800,
00190                 'ni'       => 8715,
00191                 'not'      => 172,
00192                 'notin'    => 8713,
00193                 'nsub'     => 8836,
00194                 'Ntilde'   => 209,
00195                 'ntilde'   => 241,
00196                 'Nu'       => 925,
00197                 'nu'       => 957,
00198                 'Oacute'   => 211,
00199                 'oacute'   => 243,
00200                 'Ocirc'    => 212,
00201                 'ocirc'    => 244,
00202                 'OElig'    => 338,
00203                 'oelig'    => 339,
00204                 'Ograve'   => 210,
00205                 'ograve'   => 242,
00206                 'oline'    => 8254,
00207                 'Omega'    => 937,
00208                 'omega'    => 969,
00209                 'Omicron'  => 927,
00210                 'omicron'  => 959,
00211                 'oplus'    => 8853,
00212                 'or'       => 8744,
00213                 'ordf'     => 170,
00214                 'ordm'     => 186,
00215                 'Oslash'   => 216,
00216                 'oslash'   => 248,
00217                 'Otilde'   => 213,
00218                 'otilde'   => 245,
00219                 'otimes'   => 8855,
00220                 'Ouml'     => 214,
00221                 'ouml'     => 246,
00222                 'para'     => 182,
00223                 'part'     => 8706,
00224                 'permil'   => 8240,
00225                 'perp'     => 8869,
00226                 'Phi'      => 934,
00227                 'phi'      => 966,
00228                 'Pi'       => 928,
00229                 'pi'       => 960,
00230                 'piv'      => 982,
00231                 'plusmn'   => 177,
00232                 'pound'    => 163,
00233                 'prime'    => 8242,
00234                 'Prime'    => 8243,
00235                 'prod'     => 8719,
00236                 'prop'     => 8733,
00237                 'Psi'      => 936,
00238                 'psi'      => 968,
00239                 'quot'     => 34,
00240                 'radic'    => 8730,
00241                 'rang'     => 9002,
00242                 'raquo'    => 187,
00243                 'rarr'     => 8594,
00244                 'rArr'     => 8658,
00245                 'rceil'    => 8969,
00246                 'rdquo'    => 8221,
00247                 'real'     => 8476,
00248                 'reg'      => 174,
00249                 'rfloor'   => 8971,
00250                 'Rho'      => 929,
00251                 'rho'      => 961,
00252                 'rlm'      => 8207,
00253                 'rsaquo'   => 8250,
00254                 'rsquo'    => 8217,
00255                 'sbquo'    => 8218,
00256                 'Scaron'   => 352,
00257                 'scaron'   => 353,
00258                 'sdot'     => 8901,
00259                 'sect'     => 167,
00260                 'shy'      => 173,
00261                 'Sigma'    => 931,
00262                 'sigma'    => 963,
00263                 'sigmaf'   => 962,
00264                 'sim'      => 8764,
00265                 'spades'   => 9824,
00266                 'sub'      => 8834,
00267                 'sube'     => 8838,
00268                 'sum'      => 8721,
00269                 'sup'      => 8835,
00270                 'sup1'     => 185,
00271                 'sup2'     => 178,
00272                 'sup3'     => 179,
00273                 'supe'     => 8839,
00274                 'szlig'    => 223,
00275                 'Tau'      => 932,
00276                 'tau'      => 964,
00277                 'there4'   => 8756,
00278                 'Theta'    => 920,
00279                 'theta'    => 952,
00280                 'thetasym' => 977,
00281                 'thinsp'   => 8201,
00282                 'THORN'    => 222,
00283                 'thorn'    => 254,
00284                 'tilde'    => 732,
00285                 'times'    => 215,
00286                 'trade'    => 8482,
00287                 'Uacute'   => 218,
00288                 'uacute'   => 250,
00289                 'uarr'     => 8593,
00290                 'uArr'     => 8657,
00291                 'Ucirc'    => 219,
00292                 'ucirc'    => 251,
00293                 'Ugrave'   => 217,
00294                 'ugrave'   => 249,
00295                 'uml'      => 168,
00296                 'upsih'    => 978,
00297                 'Upsilon'  => 933,
00298                 'upsilon'  => 965,
00299                 'Uuml'     => 220,
00300                 'uuml'     => 252,
00301                 'weierp'   => 8472,
00302                 'Xi'       => 926,
00303                 'xi'       => 958,
00304                 'Yacute'   => 221,
00305                 'yacute'   => 253,
00306                 'yen'      => 165,
00307                 'Yuml'     => 376,
00308                 'yuml'     => 255,
00309                 'Zeta'     => 918,
00310                 'zeta'     => 950,
00311                 'zwj'      => 8205,
00312                 'zwnj'     => 8204
00313         );
00314 
00318         static $htmlEntityAliases = array(
00319                 'רלמ' => 'rlm',
00320                 'رلم' => 'rlm',
00321         );
00322 
00326         static $attribsRegex;
00327 
/**
 * Return the regular expression used to pick attribute name/value pairs
 * out of the text between a tag name and its closing bracket.
 *
 * The pattern is assembled on the first call and cached in
 * self::$attribsRegex; later calls return the cached string.
 *
 * Capture groups of the pattern:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for valueless attributes)
 *   3 - value given in double quotes
 *   4 - value given in single quotes
 *   5 - unquoted value
 *   6 - bare "#hex" value (not valid unquoted, but common for colours)
 *
 * @return String PCRE pattern using the "s" and "x" modifiers
 */
static function getAttribsRegex() {
	# Already built on an earlier call: hand back the cached pattern.
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$nameStart = '[:A-Z_a-z0-9]';
	$nameChar = '[:A-Z_a-z-.0-9]';
	$ws = '[\x09\x0a\x0d\x20]';

	# The "x" modifier makes the layout whitespace and the #-comments
	# inside the pattern insignificant.
	self::$attribsRegex =
		"/(?:^|$ws)({$nameStart}{$nameChar}*)
				  ($ws*=$ws*
					(?:
					 # The attribute value: quoted or alone
					  \"([^<\"]*)\"
					 | '([^<']*)'
					 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
					 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
										 # colors are specified like this.
										 # We'll be normalizing it.
					)
				)?(?=$ws|\$)/sx";

	return self::$attribsRegex;
}
00354 
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each piece that parses as a whitelisted tag is
 * re-emitted with its attributes passed through Sanitizer::fixTagAttributes();
 * every other piece has its '<' and '>' escaped. When $wgUseTidy is off the
 * method also tracks open tags so that mismatched close tags are escaped and
 * anything still open at the end of the text is closed.
 *
 * @param $text String
 * @param $processCallback Callback invoked as ( &$params, $args ) on the
 *   attribute text of each whitelisted opening tag before it is sanitised
 * @param $args Array second argument handed to $processCallback
 * @param $extratags Array additional tag names to allow; they are treated
 *   as tags that must be closed
 * @param $removetags Array tag names to drop from the whitelist
 * @return String the cleaned text
 */
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
	global $wgUseTidy;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	wfProfileIn( __METHOD__ );

	if ( !$staticInitialised ) {

		$htmlpairsStatic = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul','ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		global $wgAllowImageTag;
		if ( $wgAllowImageTag ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = true;
	}
	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );

	# One pattern for both branches. Groups:
	# 1: '/' if this is a close tag
	# 2: element name
	# 3: text between element name and >
	# 4: ending '>' or '/>'
	# 5: everything until the next '<'
	$tagPattern = '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!';

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !$wgUseTidy ) {
		# $tagstack: currently open elements of the innermost table context.
		# $tablestack: the saved $tagstack of every enclosing table context.
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = false;
			$t = strtolower( (string)$t );
			if ( isset( $htmlelements[$t] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					# array_pop() yields null when nothing is open.
					$ot = array_pop( $tagstack );
					if ( $ot !== $t ) {
						if ( $ot !== null && isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array( $ot );
							$ot = array_pop( $tagstack );
							while ( $ot !== null && $ot !== $t && isset( $htmlsingleallowed[$ot] ) ) {
								$optstack[] = $ot;
								$ot = array_pop( $tagstack );
							}
							if ( $ot !== $t ) {
								# No match. Restore the stack exactly as it was:
								# first the element that stopped the search (it
								# used to be dropped here, which e.g. lost the
								# enclosing 'table'), then the optional elements.
								$badtag = true;
								if ( $ot !== null ) {
									$tagstack[] = $ot;
								}
								while ( $optstack ) {
									$tagstack[] = array_pop( $optstack );
								}
							}
						} else {
							# Put the element back; never store the null that
							# an empty stack produced.
							if ( $ot !== null ) {
								$tagstack[] = $ot;
							}
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( $ot === null || !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = true;
							}
						}
					}
					# A table was closed, either directly or after skipping
					# unclosed cells/rows: return to the stack of the
					# surrounding context. Previously this happened only for
					# a direct match, stranding the outer open tags.
					if ( !$badtag && $t === 'table' ) {
						$tagstack = $tablestack ? array_pop( $tablestack ) : array();
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) &&
					!in_array( 'table', $tagstack, true ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack, true ) &&
					!isset( $htmlnest[$t] ) ) {
						$badtag = true;
					# Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace == '/>' &&
					isset( $htmlpairs[$t] ) ) {
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
					} elseif ( isset( $tabletags[$t] )
					&& in_array( $t, $tagstack, true ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t === 'table' ) {
							# Start a fresh context; the outer one is resumed
							# when this table is closed.
							$tablestack[] = $tagstack;
							$tagstack = array();
						}
						$tagstack[] = $t;
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Not a whitelisted or well-placed tag: show it as literal text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( ( $t = array_pop( $tagstack ) ) !== null ) {
			$text .= "</$t>\n";
			if ( $t === 'table' ) {
				$tagstack = $tablestack ? array_pop( $tablestack ) : array();
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			$regs = array();
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}
			$t = strtolower( (string)$t );
			if ( isset( $htmlelements[$t] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}
				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				$rest = str_replace( '>', '&gt;', $rest );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00569 
/**
 * Remove every complete '<!--' ... '-->' comment from the text.
 *
 * When a comment, together with the spaces around it, is both preceded and
 * followed by a newline, the comment, those spaces and one of the two
 * newlines are removed so that no blank line is left behind. Otherwise only
 * the comment itself is removed. Processing stops at the first comment that
 * has no closing '-->'; that comment and everything after it is kept.
 *
 * @param $text String
 * @return String the text without its terminated comments
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			# No comment left
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Step past the '-->' so $close is the first offset after the comment
		$close += 3;

		# Grow a window [$left, $left + $span) from the character before
		# the comment outwards over any run of spaces on both sides.
		$left = max( $open - 1, 0 );
		$span = $close - $left;
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
			$span++;
		}
		while ( substr( $text, $left + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = substr( $text, $left, 1 ) === "\n";
		$newlineAfter = substr( $text, $left + $span, 1 ) === "\n";
		if ( !( $newlineBefore && $newlineAfter ) ) {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $close - $open );
			continue;
		}

		# Remove the comment, leading and trailing
		# spaces, and leave only one newline.
		$text = substr_replace( $text, "\n", $left, $span + 1 );
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00614 
00628         static function fixDeprecatedAttributes( $attribs, $element ) {
00629                 global $wgHtml5, $wgCleanupPresentationalAttributes;
00630 
00631                 // presentational attributes were removed from html5, we can leave them
00632                 // in when html5 is turned off
00633                 if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
00634                         return $attribs;
00635                 }
00636 
00637                 $table = array( 'table' );
00638                 $cells = array( 'td', 'th' );
00639                 $colls = array( 'col', 'colgroup' );
00640                 $tblocks = array( 'tbody', 'tfoot', 'thead' );
00641                 $h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
00642 
00643                 $presentationalAttribs = array(
00644                         'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
00645                         'clear' => array( 'clear', array( 'br' ) ),
00646                         'height' => array( 'height', $cells ),
00647                         'nowrap' => array( 'white-space', $cells ),
00648                         'size' => array( 'height', array( 'hr' ) ),
00649                         'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
00650                         'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
00651                         'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
00652                 );
00653 
00654                 // Ensure that any upper case or mixed case attributes are converted to lowercase
00655                 foreach ( $attribs as $attribute => $value ) {
00656                         if ( $attribute !== strtolower( $attribute ) && array_key_exists( strtolower( $attribute ), $presentationalAttribs ) ) {
00657                                 $attribs[strtolower( $attribute )] = $value;
00658                                 unset( $attribs[$attribute] );
00659                         }
00660                 }
00661 
00662                 $style = "";
00663                 foreach ( $presentationalAttribs as $attribute => $info ) {
00664                         list( $property, $elements ) = $info;
00665 
00666                         // Skip if this attribute is not relevant to this element
00667                         if ( !in_array( $element, $elements ) ) {
00668                                 continue;
00669                         }
00670 
00671                         // Skip if the attribute is not used
00672                         if ( !array_key_exists( $attribute, $attribs ) ) {
00673                                 continue;
00674                         }
00675 
00676                         $value = $attribs[$attribute];
00677 
00678                         // For nowrap the value should be nowrap instead of whatever text is in the value
00679                         if ( $attribute === 'nowrap' ) {
00680                                 $value = 'nowrap';
00681                         }
00682 
00683                         // clear="all" is clear: both; in css
00684                         if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) {
00685                                 $value = 'both';
00686                         }
00687 
00688                         // Size based properties should have px applied to them if they have no unit
00689                         if ( in_array( $attribute, array( 'height', 'width', 'size' ) ) ) {
00690                                 if ( preg_match( '/^[\d.]+$/', $value ) ) {
00691                                         $value = "{$value}px";
00692                                 }
00693                         }
00694 
00695                         $style .= " $property: $value;";
00696 
00697                         unset( $attribs[$attribute] );
00698                 }
00699 
00700                 if ( $style ) {
00701                         // Prepend our style rules so that they can be overridden by user css
00702                         if ( isset($attribs['style']) ) {
00703                                 $style .= " " . $attribs['style'];
00704                         }
00705                         $attribs['style'] = trim($style);
00706                 }
00707 
00708                 return $attribs;
00709         }
00710 
00726         static function validateTagAttributes( $attribs, $element ) {
00727                 return Sanitizer::validateAttributes( $attribs,
00728                         Sanitizer::attributeWhitelist( $element ) );
00729         }
00730 
00746         static function validateAttributes( $attribs, $whitelist ) {
00747                 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
00748 
00749                 $whitelist = array_flip( $whitelist );
00750                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
00751 
00752                 $out = array();
00753                 foreach( $attribs as $attribute => $value ) {
00754                         #allow XML namespace declaration if RDFa is enabled
00755                         if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
00756                                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00757                                         $out[$attribute] = $value;
00758                                 }
00759 
00760                                 continue;
00761                         }
00762 
00763                         # Allow any attribute beginning with "data-", if in HTML5 mode
00764                         if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
00765                                 continue;
00766                         }
00767 
00768                         # Strip javascript "expression" from stylesheets.
00769                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
00770                         if( $attribute == 'style' ) {
00771                                 $value = Sanitizer::checkCss( $value );
00772                         }
00773 
00774                         if ( $attribute === 'id' ) {
00775                                 $value = Sanitizer::escapeId( $value, 'noninitial' );
00776                         }
00777 
00778                         //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
00779                         if ( $attribute === 'rel' || $attribute === 'rev' ||
00780                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
00781                                 $attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
00782                                 $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
00783                                 $attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
00784 
00785                                 //Paranoia. Allow "simple" values but suppress javascript
00786                                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00787                                         continue;
00788                                 }
00789                         }
00790 
00791                         # NOTE: even though elements using href/src are not allowed directly, supply
00792                         #       validation code that can be used by tag hook handlers, etc
00793                         if ( $attribute === 'href' || $attribute === 'src' ) {
00794                                 if ( !preg_match( $hrefExp, $value ) ) {
00795                                         continue; //drop any href or src attributes not using an allowed protocol.
00796                                                   //NOTE: this also drops all relative URLs
00797                                 }
00798                         }
00799 
00800                         // If this attribute was previously set, override it.
00801                         // Output should only have one attribute of each name.
00802                         $out[$attribute] = $value;
00803                 }
00804 
00805                 if ( $wgAllowMicrodataAttributes ) {
00806                         # itemtype, itemid, itemref don't make sense without itemscope
00807                         if ( !array_key_exists( 'itemscope', $out ) ) {
00808                                 unset( $out['itemtype'] );
00809                                 unset( $out['itemid'] );
00810                                 unset( $out['itemref'] );
00811                         }
00812                         # TODO: Strip itemprop if we aren't descendants of an itemscope.
00813                 }
00814                 return $out;
00815         }
00816 
00827         static function mergeAttributes( $a, $b ) {
00828                 $out = array_merge( $a, $b );
00829                 if( isset( $a['class'] ) && isset( $b['class'] )
00830                 && is_string( $a['class'] ) && is_string( $b['class'] )
00831                 && $a['class'] !== $b['class'] ) {
00832                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
00833                                 -1, PREG_SPLIT_NO_EMPTY );
00834                         $out['class'] = implode( ' ', array_unique( $classes ) );
00835                 }
00836                 return $out;
00837         }
00838 
00848         public static function normalizeCss( $value ) {
00849 
00850                 // Decode character references like &#123;
00851                 $value = Sanitizer::decodeCharReferences( $value );
00852 
00853                 // Decode escape sequences and line continuation
00854                 // See the grammar in the CSS 2 spec, appendix D.
00855                 // This has to be done AFTER decoding character references.
00856                 // This means it isn't possible for this function to return
00857                 // unsanitized escape sequences. It is possible to manufacture
00858                 // input that contains character references that decode to
00859                 // escape sequences that decode to character references, but
00860                 // it's OK for the return value to contain character references
00861                 // because the caller is supposed to escape those anyway.
00862                 static $decodeRegex;
00863                 if ( !$decodeRegex ) {
00864                         $space = '[\\x20\\t\\r\\n\\f]';
00865                         $nl = '(?:\\n|\\r\\n|\\r|\\f)';
00866                         $backslash = '\\\\';
00867                         $decodeRegex = "/ $backslash
00868                                 (?:
00869                                         ($nl) |  # 1. Line continuation
00870                                         ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
00871                                         (.) | # 3. backslash cancelling special meaning
00872                                         () | # 4. backslash at end of string
00873                                 )/xu";
00874                 }
00875                 $value = preg_replace_callback( $decodeRegex,
00876                         array( __CLASS__, 'cssDecodeCallback' ), $value );
00877 
00878                 // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
00879                 $value = preg_replace_callback(
00880                         '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
00881                         array( __CLASS__, 'cssNormalizeUnicodeWidth' ),
00882                         $value
00883                 );
00884 
00885                 // Convert more characters IE6 might treat as ascii
00886                 // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
00887                 $value = str_replace(
00888                         array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
00889                         array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
00890                         $value
00891                 );
00892 
00893                 // Remove any comments; IE gets token splitting wrong
00894                 // This must be done AFTER decoding character references and
00895                 // escape sequences, because those steps can introduce comments
00896                 // This step cannot introduce character references or escape
00897                 // sequences, because it replaces comments with spaces rather
00898                 // than removing them completely.
00899                 $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
00900 
00901                 // Remove anything after a comment-start token, to guard against
00902                 // incorrect client implementations.
00903                 $commentPos = strpos( $value, '/*' );
00904                 if ( $commentPos !== false ) {
00905                         $value = substr( $value, 0, $commentPos );
00906                 }
00907 
00908                 // S followed by repeat, iteration, or prolonged sound marks,
00909                 // which IE will treat as "ss"
00910                 $value = preg_replace(
00911                         '/s(?:
00912                                 \xE3\x80\xB1 | # U+3031
00913                                 \xE3\x82\x9D | # U+309D
00914                                 \xE3\x83\xBC | # U+30FC
00915                                 \xE3\x83\xBD | # U+30FD
00916                                 \xEF\xB9\xBC | # U+FE7C
00917                                 \xEF\xB9\xBD | # U+FE7D
00918                                 \xEF\xBD\xB0   # U+FF70
00919                         )/ix',
00920                         'ss',
00921                         $value
00922                 );
00923 
00924                 return $value;
00925         }
00926 
00927 
00946         static function checkCss( $value ) {
00947                 $value = self::normalizeCss( $value );
00948 
00949                 // Reject problematic keywords and control characters
00950                 if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
00951                         return '/* invalid control char */';
00952                 } elseif ( preg_match(
00953                         '! expression
00954                                 | filter\s*:
00955                                 | accelerator\s*:
00956                                 | -o-link\s*:
00957                                 | -o-link-source\s*:
00958                                 | -o-replace\s*:
00959                                 | url\s*\(
00960                                 | image\s*\(
00961                                 | image-set\s*\(
00962                         !ix', $value ) ) {                      return '/* insecure input */';
00963                 }
00964                 return $value;
00965         }
00966 
00972         static function cssNormalizeUnicodeWidth( $matches ) {
00973                 $cp = utf8ToCodepoint( $matches[0] );
00974                 if ( $cp === false ) {
00975                         return '';
00976                 }
00977                 return chr( $cp - 65248 ); // ASCII range \x21-\x7A
00978         }
00979 
00984         static function cssDecodeCallback( $matches ) {
00985                 if ( $matches[1] !== '' ) {
00986                         // Line continuation
00987                         return '';
00988                 } elseif ( $matches[2] !== '' ) {
00989                         $char = codepointToUtf8( hexdec( $matches[2] ) );
00990                 } elseif ( $matches[3] !== '' ) {
00991                         $char = $matches[3];
00992                 } else {
00993                         $char = '\\';
00994                 }
00995                 if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
00996                         // These characters need to be escaped in strings
00997                         // Clean up the escape sequence to avoid parsing errors by clients
00998                         return '\\' . dechex( ord( $char ) ) . ' ';
00999                 } else {
01000                         // Decode unnecessary escape
01001                         return $char;
01002                 }
01003         }
01004 
01024         static function fixTagAttributes( $text, $element ) {
01025                 if( trim( $text ) == '' ) {
01026                         return '';
01027                 }
01028 
01029                 $decoded = Sanitizer::decodeTagAttributes( $text );
01030                 $decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
01031                 $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
01032 
01033                 $attribs = array();
01034                 foreach( $stripped as $attribute => $value ) {
01035                         $encAttribute = htmlspecialchars( $attribute );
01036                         $encValue = Sanitizer::safeEncodeAttribute( $value );
01037 
01038                         $attribs[] = "$encAttribute=\"$encValue\"";
01039                 }
01040                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
01041         }
01042 
01048         static function encodeAttribute( $text ) {
01049                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
01050 
01051                 // Whitespace is normalized during attribute decoding,
01052                 // so if we've been passed non-spaces we must encode them
01053                 // ahead of time or they won't be preserved.
01054                 $encValue = strtr( $encValue, array(
01055                         "\n" => '&#10;',
01056                         "\r" => '&#13;',
01057                         "\t" => '&#9;',
01058                 ) );
01059 
01060                 return $encValue;
01061         }
01062 
01069         static function safeEncodeAttribute( $text ) {
01070                 $encValue = Sanitizer::encodeAttribute( $text );
01071 
01072                 # Templates and links may be expanded in later parsing,
01073                 # creating invalid or dangerous output. Suppress this.
01074                 $encValue = strtr( $encValue, array(
01075                         '<'    => '&lt;',   // This should never happen,
01076                         '>'    => '&gt;',   // we've received invalid input
01077                         '"'    => '&quot;', // which should have been escaped.
01078                         '{'    => '&#123;',
01079                         '['    => '&#91;',
01080                         "''"   => '&#39;&#39;',
01081                         'ISBN' => '&#73;SBN',
01082                         'RFC'  => '&#82;FC',
01083                         'PMID' => '&#80;MID',
01084                         '|'    => '&#124;',
01085                         '__'   => '&#95;_',
01086                 ) );
01087 
01088                 # Stupid hack
01089                 $encValue = preg_replace_callback(
01090                         '/(' . wfUrlProtocols() . ')/',
01091                         array( 'Sanitizer', 'armorLinksCallback' ),
01092                         $encValue );
01093                 return $encValue;
01094         }
01095 
01127         static function escapeId( $id, $options = array() ) {
01128                 global $wgHtml5, $wgExperimentalHtmlIds;
01129                 $options = (array)$options;
01130 
01131                 if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
01132                         $id = Sanitizer::decodeCharReferences( $id );
01133                         $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
01134                         $id = trim( $id, '_' );
01135                         if ( $id === '' ) {
01136                                 # Must have been all whitespace to start with.
01137                                 return '_';
01138                         } else {
01139                                 return $id;
01140                         }
01141                 }
01142 
01143                 # HTML4-style escaping
01144                 static $replace = array(
01145                         '%3A' => ':',
01146                         '%' => '.'
01147                 );
01148 
01149                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
01150                 $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
01151 
01152                 if ( !preg_match( '/^[a-zA-Z]/', $id )
01153                 && !in_array( 'noninitial', $options ) )  {
01154                         // Initial character must be a letter!
01155                         $id = "x$id";
01156                 }
01157                 return $id;
01158         }
01159 
01171         static function escapeClass( $class ) {
01172                 // Convert ugly stuff to underscores and kill underscores in ugly places
01173                 return rtrim(preg_replace(
01174                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
01175                         '_',
01176                         $class ), '_');
01177         }
01178 
01186         static function escapeHtmlAllowEntities( $html ) {
01187                 $html = Sanitizer::decodeCharReferences( $html );
01188                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
01189                 # hurt.
01190                 $html = htmlspecialchars( $html, ENT_QUOTES );
01191                 return $html;
01192         }
01193 
01199         private static function armorLinksCallback( $matches ) {
01200                 return str_replace( ':', '&#58;', $matches[1] );
01201         }
01202 
01211         public static function decodeTagAttributes( $text ) {
01212                 if( trim( $text ) == '' ) {
01213                         return array();
01214                 }
01215 
01216                 $attribs = array();
01217                 $pairs = array();
01218                 if( !preg_match_all(
01219                         self::getAttribsRegex(),
01220                         $text,
01221                         $pairs,
01222                         PREG_SET_ORDER ) ) {
01223                         return $attribs;
01224                 }
01225 
01226                 foreach( $pairs as $set ) {
01227                         $attribute = strtolower( $set[1] );
01228                         $value = Sanitizer::getTagAttributeCallback( $set );
01229 
01230                         // Normalize whitespace
01231                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
01232                         $value = trim( $value );
01233 
01234                         // Decode character references
01235                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
01236                 }
01237                 return $attribs;
01238         }
01239 
01247         private static function getTagAttributeCallback( $set ) {
01248                 if( isset( $set[6] ) ) {
01249                         # Illegal #XXXXXX color with no quotes.
01250                         return $set[6];
01251                 } elseif( isset( $set[5] ) ) {
01252                         # No quotes.
01253                         return $set[5];
01254                 } elseif( isset( $set[4] ) ) {
01255                         # Single-quoted
01256                         return $set[4];
01257                 } elseif( isset( $set[3] ) ) {
01258                         # Double-quoted
01259                         return $set[3];
01260                 } elseif( !isset( $set[2] ) ) {
01261                         # In XHTML, attributes must have a value.
01262                         # For 'reduced' form, return explicitly the attribute name here.
01263                         return $set[1];
01264                 } else {
01265                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
01266                 }
01267         }
01268 
01280         private static function normalizeAttributeValue( $text ) {
01281                 return str_replace( '"', '&quot;',
01282                         self::normalizeWhitespace(
01283                                 Sanitizer::normalizeCharReferences( $text ) ) );
01284         }
01285 
01290         private static function normalizeWhitespace( $text ) {
01291                 return preg_replace(
01292                         '/\r\n|[\x20\x0d\x0a\x09]/',
01293                         ' ',
01294                         $text );
01295         }
01296 
01305         static function normalizeSectionNameWhitespace( $section ) {
01306                 return trim( preg_replace( '/[ _]+/', ' ', $section ) );
01307         }
01308 
01324         static function normalizeCharReferences( $text ) {
01325                 return preg_replace_callback(
01326                         self::CHAR_REFS_REGEX,
01327                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
01328                         $text );
01329         }
01334         static function normalizeCharReferencesCallback( $matches ) {
01335                 $ret = null;
01336                 if( $matches[1] != '' ) {
01337                         $ret = Sanitizer::normalizeEntity( $matches[1] );
01338                 } elseif( $matches[2] != '' ) {
01339                         $ret = Sanitizer::decCharReference( $matches[2] );
01340                 } elseif( $matches[3] != ''  ) {
01341                         $ret = Sanitizer::hexCharReference( $matches[3] );
01342                 }
01343                 if( is_null( $ret ) ) {
01344                         return htmlspecialchars( $matches[0] );
01345                 } else {
01346                         return $ret;
01347                 }
01348         }
01349 
01360         static function normalizeEntity( $name ) {
01361                 if ( isset( self::$htmlEntityAliases[$name] ) ) {
01362                         return '&' . self::$htmlEntityAliases[$name] . ';';
01363                 } elseif ( in_array( $name,
01364                 array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
01365                         return "&$name;";
01366                 } elseif ( isset( self::$htmlEntities[$name] ) ) {
01367                         return '&#' . self::$htmlEntities[$name] . ';';
01368                 } else {
01369                         return "&amp;$name;";
01370                 }
01371         }
01372 
01377         static function decCharReference( $codepoint ) {
01378                 $point = intval( $codepoint );
01379                 if( Sanitizer::validateCodepoint( $point ) ) {
01380                         return sprintf( '&#%d;', $point );
01381                 } else {
01382                         return null;
01383                 }
01384         }
01385 
01390         static function hexCharReference( $codepoint ) {
01391                 $point = hexdec( $codepoint );
01392                 if( Sanitizer::validateCodepoint( $point ) ) {
01393                         return sprintf( '&#x%x;', $point );
01394                 } else {
01395                         return null;
01396                 }
01397         }
01398 
01404         private static function validateCodepoint( $codepoint ) {
01405                 return ($codepoint ==    0x09)
01406                         || ($codepoint ==    0x0a)
01407                         || ($codepoint ==    0x0d)
01408                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
01409                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
01410                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
01411         }
01412 
01420         public static function decodeCharReferences( $text ) {
01421                 return preg_replace_callback(
01422                         self::CHAR_REFS_REGEX,
01423                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01424                         $text );
01425         }
01426 
01437         public static function decodeCharReferencesAndNormalize( $text ) {
01438                 global $wgContLang;
01439                 $text = preg_replace_callback(
01440                         self::CHAR_REFS_REGEX,
01441                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01442                         $text, /* limit */ -1, $count );
01443 
01444                 if ( $count ) {
01445                         return $wgContLang->normalize( $text );
01446                 } else {
01447                         return $text;
01448                 }
01449         }
01450 
01455         static function decodeCharReferencesCallback( $matches ) {
01456                 if( $matches[1] != '' ) {
01457                         return Sanitizer::decodeEntity( $matches[1] );
01458                 } elseif( $matches[2] != '' ) {
01459                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
01460                 } elseif( $matches[3] != ''  ) {
01461                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
01462                 }
01463                 # Last case should be an ampersand by itself
01464                 return $matches[0];
01465         }
01466 
01474         static function decodeChar( $codepoint ) {
01475                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
01476                         return codepointToUtf8( $codepoint );
01477                 } else {
01478                         return UTF8_REPLACEMENT;
01479                 }
01480         }
01481 
01490         static function decodeEntity( $name ) {
01491                 if ( isset( self::$htmlEntityAliases[$name] ) ) {
01492                         $name = self::$htmlEntityAliases[$name];
01493                 }
01494                 if( isset( self::$htmlEntities[$name] ) ) {
01495                         return codepointToUtf8( self::$htmlEntities[$name] );
01496                 } else {
01497                         return "&$name;";
01498                 }
01499         }
01500 
01507         static function attributeWhitelist( $element ) {
01508                 static $list;
01509                 if( !isset( $list ) ) {
01510                         $list = Sanitizer::setupAttributeWhitelist();
01511                 }
01512                 return isset( $list[$element] )
01513                         ? $list[$element]
01514                         : array();
01515         }
01516 
01522         static function setupAttributeWhitelist() {
01523                 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
01524 
01525                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
01526 
01527                 if ( $wgAllowRdfaAttributes ) {
01528                         #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
01529                         $common = array_merge( $common, array(
01530                             'about', 'property', 'resource', 'datatype', 'typeof',
01531                         ) );
01532                 }
01533 
01534                 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
01535                         # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
01536                         $common = array_merge( $common, array(
01537                             'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
01538                         ) );
01539                 }
01540 
01541                 $block = array_merge( $common, array( 'align' ) );
01542                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
01543                 $tablecell = array( 'abbr',
01544                                     'axis',
01545                                     'headers',
01546                                     'scope',
01547                                     'rowspan',
01548                                     'colspan',
01549                                     'nowrap', # deprecated
01550                                     'width',  # deprecated
01551                                     'height', # deprecated
01552                                     'bgcolor' # deprecated
01553                                     );
01554 
01555                 # Numbers refer to sections in HTML 4.01 standard describing the element.
01556                 # See: http://www.w3.org/TR/html4/
01557                 $whitelist = array (
01558                         # 7.5.4
01559                         'div'        => $block,
01560                         'center'     => $common, # deprecated
01561                         'span'       => $block, # ??
01562 
01563                         # 7.5.5
01564                         'h1'         => $block,
01565                         'h2'         => $block,
01566                         'h3'         => $block,
01567                         'h4'         => $block,
01568                         'h5'         => $block,
01569                         'h6'         => $block,
01570 
01571                         # 7.5.6
01572                         # address
01573 
01574                         # 8.2.4
01575                         # bdo
01576 
01577                         # 9.2.1
01578                         'em'         => $common,
01579                         'strong'     => $common,
01580                         'cite'       => $common,
01581                         'dfn'        => $common,
01582                         'code'       => $common,
01583                         'samp'       => $common,
01584                         'kbd'        => $common,
01585                         'var'        => $common,
01586                         'abbr'       => $common,
01587                         # acronym
01588 
01589                         # 9.2.2
01590                         'blockquote' => array_merge( $common, array( 'cite' ) ),
01591                         # q
01592 
01593                         # 9.2.3
01594                         'sub'        => $common,
01595                         'sup'        => $common,
01596 
01597                         # 9.3.1
01598                         'p'          => $block,
01599 
01600                         # 9.3.2
01601                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
01602 
01603                         # 9.3.4
01604                         'pre'        => array_merge( $common, array( 'width' ) ),
01605 
01606                         # 9.4
01607                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01608                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01609 
01610                         # 10.2
01611                         'ul'         => array_merge( $common, array( 'type' ) ),
01612                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
01613                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
01614 
01615                         # 10.3
01616                         'dl'         => $common,
01617                         'dd'         => $common,
01618                         'dt'         => $common,
01619 
01620                         # 11.2.1
01621                         'table'      => array_merge( $common,
01622                                                                 array( 'summary', 'width', 'border', 'frame',
01623                                                                                 'rules', 'cellspacing', 'cellpadding',
01624                                                                                 'align', 'bgcolor',
01625                                                                 ) ),
01626 
01627                         # 11.2.2
01628                         'caption'    => array_merge( $common, array( 'align' ) ),
01629 
01630                         # 11.2.3
01631                         'thead'      => array_merge( $common, $tablealign ),
01632                         'tfoot'      => array_merge( $common, $tablealign ),
01633                         'tbody'      => array_merge( $common, $tablealign ),
01634 
01635                         # 11.2.4
01636                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01637                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01638 
01639                         # 11.2.5
01640                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
01641 
01642                         # 11.2.6
01643                         'td'         => array_merge( $common, $tablecell, $tablealign ),
01644                         'th'         => array_merge( $common, $tablecell, $tablealign ),
01645 
01646                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
01647                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
01648 
01649                         # 13.2
01650                         # Not usually allowed, but may be used for extension-style hooks
01651                         # such as <math> when it is rasterized, or if $wgAllowImageTag is
01652                         # true
01653                         'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
01654 
01655                         # 15.2.1
01656                         'tt'         => $common,
01657                         'b'          => $common,
01658                         'i'          => $common,
01659                         'big'        => $common,
01660                         'small'      => $common,
01661                         'strike'     => $common,
01662                         's'          => $common,
01663                         'u'          => $common,
01664 
01665                         # 15.2.2
01666                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
01667                         # basefont
01668 
01669                         # 15.3
01670                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
01671 
01672                         # XHTML Ruby annotation text module, simple ruby only.
01673                         # http://www.w3c.org/TR/ruby/
01674                         'ruby'       => $common,
01675                         # rbc
01676                         # rtc
01677                         'rb'         => $common,
01678                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
01679                         'rp'         => $common,
01680 
01681                         # MathML root element, where used for extensions
01682                         # 'title' may not be 100% valid here; it's XHTML
01683                         # http://www.w3.org/TR/REC-MathML/
01684                         'math'       => array( 'class', 'style', 'id', 'title' ),
01685                         );
01686                 return $whitelist;
01687         }
01688 
/**
 * Reduce a string that may contain markup to plain text.
 *
 * The text is first passed through StringUtils::delimiterReplace() with
 * '<' and '>' as delimiters and an empty replacement, then character
 * references are decoded and whitespace is normalized.
 *
 * @param $text String: input that may contain <tags> and &entities;
 * @return String: the input after tag removal, decoding and whitespace normalization
 */
static function stripAllTags( $text ) {
        # Drop everything enclosed in <...>
        $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

        # Decode &entities; first, then tidy up the whitespace
        return self::normalizeWhitespace( self::decodeCharReferences( $withoutTags ) );
}
01709 
/**
 * Build a DOCTYPE declaration whose internal subset declares every named
 * entity listed in self::$htmlEntities, each one defined as the numeric
 * character reference of its code point.
 *
 * @return String: the "<!DOCTYPE html [" line, all ENTITY declarations
 *   (concatenated without separators), and the closing "]>" line
 */
static function hackDocType() {
        $declarations = array();
        foreach( self::$htmlEntities as $name => $codepoint ) {
                # e.g. <!ENTITY amp "&#38;">
                $declarations[] = '<!ENTITY ' . $name . ' "&#' . $codepoint . ';">';
        }

        return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01727 
/**
 * Clean up a URL before it is used as an external link target.
 *
 * Steps, in order:
 *  - decode character references in the input;
 *  - percent-encode brackets, angle brackets, double quotes, pipes,
 *    control characters and spaces;
 *  - in the host part, strip characters that are ignored in IDNs, so that
 *    later checks see the host name as it will effectively be resolved;
 *  - restore the brackets of an IPv6 literal host, which the escaping step
 *    would otherwise leave as %5B...%5D and thereby break the URL.
 *
 * @param $url String: the URL to clean
 * @return String: the cleaned URL; if the input does not look like
 *   "scheme:..." it is returned after the decoding and escaping steps only
 */
static function cleanUrl( $url ) {
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().
        $url = Sanitizer::decodeCharReferences( $url );

        # Escape any control characters introduced by the above step
        $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
                array( __CLASS__, 'cleanUrlCallback' ), $url );

        # Validate hostname portion
        $matches = array();
        if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
                list( /* $whole */, $protocol, $host, $rest ) = $matches;

                // Characters that will be ignored in IDNs.
                // http://tools.ietf.org/html/rfc3454#section-3.1
                // Strip them before further processing so blacklists and such work.
                $strip = "/
                        \\s|          # general whitespace
                        \xc2\xad|     # 00ad SOFT HYPHEN
                        \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                        \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                        \xe2\x81\xa0| # 2060 WORD JOINER
                        \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                        \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                        \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                        \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                        \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                        \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                        \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                        [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
                        /xuD";

                // NOTE(review): because of the /u modifier preg_replace() returns
                // null when $host is not valid UTF-8, which drops the host from
                // the result below -- confirm that this is the intended outcome.
                $host = preg_replace( $strip, '', $host );

                // IPv6 literal hosts are written in brackets, and the escaping
                // step above has turned those into %5B / %5D. Put the brackets
                // back when what they enclose looks like an address, optionally
                // followed by a port.
                $ipv6 = array();
                if( preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!D', $host, $ipv6 ) ) {
                        $host = '//[' . $ipv6[1] . ']' . $ipv6[2];
                }

                // @todo FIXME: Validate hostnames here

                return $protocol . $host . $rest;
        } else {
                return $url;
        }
}
01774 
/**
 * preg_replace_callback() handler used by cleanUrl(): percent-encodes
 * the text that was matched.
 *
 * @param $matches Array: match data; element 0 is the full match
 * @return String: the urlencode()d form of the full match
 */
static function cleanUrlCallback( $matches ) {
        $matchedText = $matches[0];

        return urlencode( $matchedText );
}
01782 
/**
 * Check whether a string is a syntactically acceptable e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first; when it returns false, whatever
 * the hook stored in $result is returned as is. Otherwise the address is
 * matched against a pattern of the form
 *   local-part "@" label *( "." label )
 * where the local part is built from the RFC 5322 "atext" characters plus
 * dots, and every label from letters, digits and hyphens (RFC 1034).
 *
 * The pattern is anchored with the D (dollar-end-only) modifier: without
 * it "$" also matches just before a trailing newline, so an address such
 * as "user@example.com\n" would be accepted.
 *
 * @param $addr String: the address to check
 * @return Bool (or whatever the hook put into $result when it aborted)
 */
public static function validateEmail( $addr ) {
        $result = null;
        if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
                return $result;
        }

        // The strings below end up inside character classes [...], where a
        // bare hyphen "-" denotes a range. It is therefore backslash-escaped,
        // and the backslash is doubled because of PHP string escaping.
        // See bug 26948
        $rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
        $rfc1034_ldh_str = "a-z0-9\\-" ;

        $HTML5_email_regexp = "/
        ^                      # start of string
        [$rfc5322_atext\\.]+    # user part which is liberal :p
        @                      # at sign
        [$rfc1034_ldh_str]+       # First domain part
        (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
        $                      # End of string
        /ixD" ; // case Insensitive, eXtended, Dollar matches at the very end only

        return (bool) preg_match( $HTML5_email_regexp, $addr );
}
01834 }