MediaWiki  REL1_24
Sanitizer.php
Go to the documentation of this file.
00001 <?php
00031 class Sanitizer {
          /**
           * Extended-mode (/x) regular expression that matches one character
           * reference or, failing that, a single ampersand. The alternatives
           * map to capture groups as follows:
           *   1 - name of a named reference (&name;); the name may contain
           *       ASCII letters, digits and bytes 0x80-0xFF
           *   2 - digits of a decimal numeric reference (&#123;)
           *   3 - hex digits of a hexadecimal numeric reference (&#x7b; or &#X7B;)
           *   4 - a bare "&" that does not begin any of the forms above
           */
00036     const CHAR_REFS_REGEX =
00037         '/&([A-Za-z0-9\x80-\xff]+);
00038          |&\#([0-9]+);
00039          |&\#[xX]([0-9A-Fa-f]+);
00040          |(&)/x';
00041 
          # Case-insensitive pattern that matches the word "javascript" or
          # "vbscript" when it sits at the start of the string, after whitespace,
          # or after an asterisk-slash pair (optionally followed by whitespace),
          # and is followed by a non-word character or the end of the string.
          # validateAttributes() uses it to reject suspicious attribute values.
00050     const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
          # Matches attribute names of the form "xmlns:<name>". validateAttributes()
          # uses it to let namespace declarations through when
          # $wgAllowRdfaAttributes is enabled.
00051     const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052 
          /**
           * Table of named HTML character entities. Keys are the case-sensitive
           * entity names (without "&" and ";"); values are the corresponding
           * Unicode code points as decimal integers.
           */
00058     private static $htmlEntities = array(
00059         'Aacute'   => 193,
00060         'aacute'   => 225,
00061         'Acirc'    => 194,
00062         'acirc'    => 226,
00063         'acute'    => 180,
00064         'AElig'    => 198,
00065         'aelig'    => 230,
00066         'Agrave'   => 192,
00067         'agrave'   => 224,
00068         'alefsym'  => 8501,
00069         'Alpha'    => 913,
00070         'alpha'    => 945,
00071         'amp'      => 38,
00072         'and'      => 8743,
00073         'ang'      => 8736,
00074         'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00075         'Aring'    => 197,
00076         'aring'    => 229,
00077         'asymp'    => 8776,
00078         'Atilde'   => 195,
00079         'atilde'   => 227,
00080         'Auml'     => 196,
00081         'auml'     => 228,
00082         'bdquo'    => 8222,
00083         'Beta'     => 914,
00084         'beta'     => 946,
00085         'brvbar'   => 166,
00086         'bull'     => 8226,
00087         'cap'      => 8745,
00088         'Ccedil'   => 199,
00089         'ccedil'   => 231,
00090         'cedil'    => 184,
00091         'cent'     => 162,
00092         'Chi'      => 935,
00093         'chi'      => 967,
00094         'circ'     => 710,
00095         'clubs'    => 9827,
00096         'cong'     => 8773,
00097         'copy'     => 169,
00098         'crarr'    => 8629,
00099         'cup'      => 8746,
00100         'curren'   => 164,
00101         'dagger'   => 8224,
00102         'Dagger'   => 8225,
00103         'darr'     => 8595,
00104         'dArr'     => 8659,
00105         'deg'      => 176,
00106         'Delta'    => 916,
00107         'delta'    => 948,
00108         'diams'    => 9830,
00109         'divide'   => 247,
00110         'Eacute'   => 201,
00111         'eacute'   => 233,
00112         'Ecirc'    => 202,
00113         'ecirc'    => 234,
00114         'Egrave'   => 200,
00115         'egrave'   => 232,
00116         'empty'    => 8709,
00117         'emsp'     => 8195,
00118         'ensp'     => 8194,
00119         'Epsilon'  => 917,
00120         'epsilon'  => 949,
00121         'equiv'    => 8801,
00122         'Eta'      => 919,
00123         'eta'      => 951,
00124         'ETH'      => 208,
00125         'eth'      => 240,
00126         'Euml'     => 203,
00127         'euml'     => 235,
00128         'euro'     => 8364,
00129         'exist'    => 8707,
00130         'fnof'     => 402,
00131         'forall'   => 8704,
00132         'frac12'   => 189,
00133         'frac14'   => 188,
00134         'frac34'   => 190,
00135         'frasl'    => 8260,
00136         'Gamma'    => 915,
00137         'gamma'    => 947,
00138         'ge'       => 8805,
00139         'gt'       => 62,
00140         'harr'     => 8596,
00141         'hArr'     => 8660,
00142         'hearts'   => 9829,
00143         'hellip'   => 8230,
00144         'Iacute'   => 205,
00145         'iacute'   => 237,
00146         'Icirc'    => 206,
00147         'icirc'    => 238,
00148         'iexcl'    => 161,
00149         'Igrave'   => 204,
00150         'igrave'   => 236,
00151         'image'    => 8465,
00152         'infin'    => 8734,
00153         'int'      => 8747,
00154         'Iota'     => 921,
00155         'iota'     => 953,
00156         'iquest'   => 191,
00157         'isin'     => 8712,
00158         'Iuml'     => 207,
00159         'iuml'     => 239,
00160         'Kappa'    => 922,
00161         'kappa'    => 954,
00162         'Lambda'   => 923,
00163         'lambda'   => 955,
00164         'lang'     => 9001,
00165         'laquo'    => 171,
00166         'larr'     => 8592,
00167         'lArr'     => 8656,
00168         'lceil'    => 8968,
00169         'ldquo'    => 8220,
00170         'le'       => 8804,
00171         'lfloor'   => 8970,
00172         'lowast'   => 8727,
00173         'loz'      => 9674,
00174         'lrm'      => 8206,
00175         'lsaquo'   => 8249,
00176         'lsquo'    => 8216,
00177         'lt'       => 60,
00178         'macr'     => 175,
00179         'mdash'    => 8212,
00180         'micro'    => 181,
00181         'middot'   => 183,
00182         'minus'    => 8722,
00183         'Mu'       => 924,
00184         'mu'       => 956,
00185         'nabla'    => 8711,
00186         'nbsp'     => 160,
00187         'ndash'    => 8211,
00188         'ne'       => 8800,
00189         'ni'       => 8715,
00190         'not'      => 172,
00191         'notin'    => 8713,
00192         'nsub'     => 8836,
00193         'Ntilde'   => 209,
00194         'ntilde'   => 241,
00195         'Nu'       => 925,
00196         'nu'       => 957,
00197         'Oacute'   => 211,
00198         'oacute'   => 243,
00199         'Ocirc'    => 212,
00200         'ocirc'    => 244,
00201         'OElig'    => 338,
00202         'oelig'    => 339,
00203         'Ograve'   => 210,
00204         'ograve'   => 242,
00205         'oline'    => 8254,
00206         'Omega'    => 937,
00207         'omega'    => 969,
00208         'Omicron'  => 927,
00209         'omicron'  => 959,
00210         'oplus'    => 8853,
00211         'or'       => 8744,
00212         'ordf'     => 170,
00213         'ordm'     => 186,
00214         'Oslash'   => 216,
00215         'oslash'   => 248,
00216         'Otilde'   => 213,
00217         'otilde'   => 245,
00218         'otimes'   => 8855,
00219         'Ouml'     => 214,
00220         'ouml'     => 246,
00221         'para'     => 182,
00222         'part'     => 8706,
00223         'permil'   => 8240,
00224         'perp'     => 8869,
00225         'Phi'      => 934,
00226         'phi'      => 966,
00227         'Pi'       => 928,
00228         'pi'       => 960,
00229         'piv'      => 982,
00230         'plusmn'   => 177,
00231         'pound'    => 163,
00232         'prime'    => 8242,
00233         'Prime'    => 8243,
00234         'prod'     => 8719,
00235         'prop'     => 8733,
00236         'Psi'      => 936,
00237         'psi'      => 968,
00238         'quot'     => 34,
00239         'radic'    => 8730,
00240         'rang'     => 9002,
00241         'raquo'    => 187,
00242         'rarr'     => 8594,
00243         'rArr'     => 8658,
00244         'rceil'    => 8969,
00245         'rdquo'    => 8221,
00246         'real'     => 8476,
00247         'reg'      => 174,
00248         'rfloor'   => 8971,
00249         'Rho'      => 929,
00250         'rho'      => 961,
00251         'rlm'      => 8207,
00252         'rsaquo'   => 8250,
00253         'rsquo'    => 8217,
00254         'sbquo'    => 8218,
00255         'Scaron'   => 352,
00256         'scaron'   => 353,
00257         'sdot'     => 8901,
00258         'sect'     => 167,
00259         'shy'      => 173,
00260         'Sigma'    => 931,
00261         'sigma'    => 963,
00262         'sigmaf'   => 962,
00263         'sim'      => 8764,
00264         'spades'   => 9824,
00265         'sub'      => 8834,
00266         'sube'     => 8838,
00267         'sum'      => 8721,
00268         'sup'      => 8835,
00269         'sup1'     => 185,
00270         'sup2'     => 178,
00271         'sup3'     => 179,
00272         'supe'     => 8839,
00273         'szlig'    => 223,
00274         'Tau'      => 932,
00275         'tau'      => 964,
00276         'there4'   => 8756,
00277         'Theta'    => 920,
00278         'theta'    => 952,
00279         'thetasym' => 977,
00280         'thinsp'   => 8201,
00281         'THORN'    => 222,
00282         'thorn'    => 254,
00283         'tilde'    => 732,
00284         'times'    => 215,
00285         'trade'    => 8482,
00286         'Uacute'   => 218,
00287         'uacute'   => 250,
00288         'uarr'     => 8593,
00289         'uArr'     => 8657,
00290         'Ucirc'    => 219,
00291         'ucirc'    => 251,
00292         'Ugrave'   => 217,
00293         'ugrave'   => 249,
00294         'uml'      => 168,
00295         'upsih'    => 978,
00296         'Upsilon'  => 933,
00297         'upsilon'  => 965,
00298         'Uuml'     => 220,
00299         'uuml'     => 252,
00300         'weierp'   => 8472,
00301         'Xi'       => 926,
00302         'xi'       => 958,
00303         'Yacute'   => 221,
00304         'yacute'   => 253,
00305         'yen'      => 165,
00306         'Yuml'     => 376,
00307         'yuml'     => 255,
00308         'Zeta'     => 918,
00309         'zeta'     => 950,
00310         'zwj'      => 8205,
00311         'zwnj'     => 8204
00312     );
00313 
          /**
           * Alternative entity names mapped to a key of $htmlEntities.
           * Both entries here (one written in Hebrew letters, one in Arabic
           * letters) alias the 'rlm' entity.
           * NOTE(review): the code that consults this table is outside this
           * excerpt -- confirm how aliases are resolved before relying on it.
           */
00317     private static $htmlEntityAliases = array(
00318         'רלמ' => 'rlm',
00319         'رلم' => 'rlm',
00320     );
00321 
          /**
           * Cache for the pattern built by getAttribsRegex(); stays null until
           * that method is called for the first time.
           */
00325     private static $attribsRegex;
00326 
00333     static function getAttribsRegex() {
00334         if ( self::$attribsRegex === null ) {
00335             $attribFirst = '[:A-Z_a-z0-9]';
00336             $attrib = '[:A-Z_a-z-.0-9]';
00337             $space = '[\x09\x0a\x0d\x20]';
00338             self::$attribsRegex =
00339                 "/(?:^|$space)({$attribFirst}{$attrib}*)
00340                   ($space*=$space*
00341                     (?:
00342                      # The attribute value: quoted or alone
00343                       \"([^<\"]*)\"
00344                      | '([^<']*)'
00345                      |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00346                      |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00347                                          # colors are specified like this.
00348                                          # We'll be normalizing it.
00349                     )
00350                 )?(?=$space|\$)/sx";
00351         }
00352         return self::$attribsRegex;
00353     }
00354 
          /**
           * Clean up HTML-like text: keep tags from an allowed set (with their
           * attributes filtered) and escape everything else that looks like a tag.
           *
           * Steps, as implemented below:
           *  - HTML comments are removed via removeHTMLcomments().
           *  - The text is split on "<" and every piece is matched against a tag
           *    regex. Pieces that are not an accepted tag are re-emitted with the
           *    "<" written as "&lt;" and every ">" written as "&gt;".
           *  - Attribute text of accepted tags goes through validateTag() and
           *    fixTagAttributes().
           *  - When $wgUseTidy is false, a tag stack is maintained as well:
           *    mismatched closing tags are escaped, elements that cannot have a
           *    close tag are emitted in self-closed form, and tags still open at
           *    the end are closed. When $wgUseTidy is true none of that balancing
           *    happens here (presumably Tidy does it later -- confirm with caller).
           *
           * The allowed-tag tables are built once and cached in static locals; they
           * are rebuilt when $wgAllowMicrodataAttributes or $wgAllowImageTag change.
           *
           * @param string $text Text to clean
           * @param callable|null $processCallback If callable, invoked as
           *   callback( &$params, $args ) on the attribute text of an allowed tag
           *   before validation; it may rewrite $params in place
           * @param mixed $args Passed through unchanged to $processCallback
           * @param array $extratags Additional tag names to allow; they are
           *   treated like paired tags (a self-closed form is rejected)
           * @param array $removetags Tag names to take out of the allowed set
           * @return string The cleaned text
           */
00367     static function removeHTMLtags( $text, $processCallback = null,
00368         $args = array(), $extratags = array(), $removetags = array()
00369     ) {
00370         global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
00371 
00372         static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00373             $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
00374 
00375         wfProfileIn( __METHOD__ );
00376 
00377         // Base our staticInitialised variable off of the global config state so that if the globals
00378         // are changed (like in the screwed up test system) we will re-initialise the settings.
00379         $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
00380         if ( !$staticInitialised || $staticInitialised != $globalContext ) {
00381 
00382             $htmlpairsStatic = array( # Tags that must be closed
00383                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00384                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00385                 'strike', 'strong', 'tt', 'var', 'div', 'center',
00386                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00387                 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
00388                 'kbd', 'samp', 'data', 'time', 'mark'
00389             );
                    # Tags whose close tag is optional (see the $htmlsingle branch below)
00390             $htmlsingle = array(
00391                 'br', 'wbr', 'hr', 'li', 'dt', 'dd'
00392             );
00393             $htmlsingleonly = array( # Elements that cannot have close tags
00394                 'br', 'wbr', 'hr'
00395             );
00396             if ( $wgAllowMicrodataAttributes ) {
00397                 $htmlsingle[] = $htmlsingleonly[] = 'meta';
00398                 $htmlsingle[] = $htmlsingleonly[] = 'link';
00399             }
00400             $htmlnest = array( # Tags that can be nested--??
00401                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00402                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
00403                 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
00404             );
00405             $tabletags = array( # Can only appear inside table, we will close them
00406                 'td', 'th', 'tr',
00407             );
00408             $htmllist = array( # Tags used by list
00409                 'ul', 'ol',
00410             );
00411             $listtags = array( # Tags that can appear in a list
00412                 'li',
00413             );
00414 
00415             if ( $wgAllowImageTag ) {
00416                 $htmlsingle[] = 'img';
00417                 $htmlsingleonly[] = 'img';
00418             }
00419 
00420             $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
00421             $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
00422 
00423             # Convert them all to hashtables for faster lookup
00424             $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
00425                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
00426             foreach ( $vars as $var ) {
00427                 $$var = array_flip( $$var );
00428             }
00429             $staticInitialised = $globalContext;
00430         }
00431         # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
00432         $extratags = array_flip( $extratags );
00433         $removetags = array_flip( $removetags );
00434         $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
00435         $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
00436 
00437         # Remove HTML comments
00438         $text = Sanitizer::removeHTMLcomments( $text );
                # Split on '<'. The first piece precedes any tag, so it only needs
                # its '>' characters escaped; each later piece starts right after a '<'.
00439         $bits = explode( '<', $text );
00440         $text = str_replace( '>', '&gt;', array_shift( $bits ) );
00441         if ( !$wgUseTidy ) {
                    # $tagstack holds the currently open tags; $tablestack saves the
                    # enclosing $tagstack each time a <table> is opened.
00442             $tagstack = $tablestack = array();
00443             foreach ( $bits as $x ) {
00444                 $regs = array();
00445                 # $slash: Does the current element start with a '/'?
00446                 # $t: Current element name
00447                 # $params: String between element name and >
00448                 # $brace: Ending '>' or '/>'
00449                 # $rest: Everything until the next element of $bits
00450                 if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00451                     list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00452                 } else {
00453                     $slash = $t = $params = $brace = $rest = null;
00454                 }
00455 
00456                 $badtag = false;
00457                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00458                     # Check our stack
00459                     if ( $slash && isset( $htmlsingleonly[$t] ) ) {
00460                         $badtag = true;
00461                     } elseif ( $slash ) {
00462                         # Closing a tag... is it the one we just opened?
00463                         wfSuppressWarnings();
00464                         $ot = array_pop( $tagstack );
00465                         wfRestoreWarnings();
00466 
00467                         if ( $ot != $t ) {
00468                             if ( isset( $htmlsingleallowed[$ot] ) ) {
00469                                 # Pop all elements with an optional close tag
00470                                 # and see if we find a match below them
00471                                 $optstack = array();
00472                                 array_push( $optstack, $ot );
00473                                 wfSuppressWarnings();
00474                                 $ot = array_pop( $tagstack );
00475                                 wfRestoreWarnings();
00476                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
00477                                     array_push( $optstack, $ot );
00478                                     wfSuppressWarnings();
00479                                     $ot = array_pop( $tagstack );
00480                                     wfRestoreWarnings();
00481                                 }
00482                                 if ( $t != $ot ) {
00483                                     # No match. Push the optional elements back again
00484                                     $badtag = true;
00485                                     wfSuppressWarnings();
00486                                     $ot = array_pop( $optstack );
00487                                     wfRestoreWarnings();
00488                                     while ( $ot ) {
00489                                         array_push( $tagstack, $ot );
00490                                         wfSuppressWarnings();
00491                                         $ot = array_pop( $optstack );
00492                                         wfRestoreWarnings();
00493                                     }
00494                                 }
00495                             } else {
                                        # Not our tag and not skippable: put it back on the stack
00496                                 wfSuppressWarnings();
00497                                 array_push( $tagstack, $ot );
00498                                 wfRestoreWarnings();
00499 
00500                                 # <li> can be nested in <ul> or <ol>, skip those cases:
00501                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
00502                                     $badtag = true;
00503                                 }
00504                             }
00505                         } else {
                                    # Matching close of a table: restore the stack saved when it opened
00506                             if ( $t == 'table' ) {
00507                                 $tagstack = array_pop( $tablestack );
00508                             }
00509                         }
                                # Closing tags are emitted without attributes
00510                         $newparams = '';
00511                     } else {
00512                         # Keep track for later
00513                         if ( isset( $tabletags[$t] ) &&
00514                         !in_array( 'table', $tagstack ) ) {
00515                             $badtag = true;
00516                         } elseif ( in_array( $t, $tagstack ) &&
00517                         !isset( $htmlnest[$t] ) ) {
00518                             $badtag = true;
00519                         # Is it a self closed htmlpair ? (bug 5487)
00520                         } elseif ( $brace == '/>' &&
00521                         isset( $htmlpairs[$t] ) ) {
00522                             $badtag = true;
00523                         } elseif ( isset( $htmlsingleonly[$t] ) ) {
00524                             # Hack to force empty tag for unclosable elements
00525                             $brace = '/>';
00526                         } elseif ( isset( $htmlsingle[$t] ) ) {
00527                             # Hack to not close $htmlsingle tags
00528                             $brace = null;
00529                             # Still need to push this optionally-closed tag to
00530                             # the tag stack so that we can match end tags
00531                             # instead of marking them as bad.
00532                             array_push( $tagstack, $t );
00533                         } elseif ( isset( $tabletags[$t] )
00534                         && in_array( $t, $tagstack ) ) {
00535                             // New table tag but forgot to close the previous one
00536                             $text .= "</$t>";
00537                         } else {
                                    # Opening a table starts a fresh stack; the outer one is saved
00538                             if ( $t == 'table' ) {
00539                                 array_push( $tablestack, $tagstack );
00540                                 $tagstack = array();
00541                             }
00542                             array_push( $tagstack, $t );
00543                         }
00544 
00545                         # Replace any variables or template parameters with
00546                         # plaintext results.
00547                         if ( is_callable( $processCallback ) ) {
00548                             call_user_func_array( $processCallback, array( &$params, $args ) );
00549                         }
00550 
00551                         if ( !Sanitizer::validateTag( $params, $t ) ) {
00552                             $badtag = true;
00553                         }
00554 
00555                         # Strip non-approved attributes from the tag
00556                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
00557                     }
00558                     if ( !$badtag ) {
00559                         $rest = str_replace( '>', '&gt;', $rest );
00560                         $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00561                         $text .= "<$slash$t$newparams$close>$rest";
00562                         continue;
00563                     }
00564                 }
                        # Unknown, malformed or rejected tag: emit the piece as escaped text
00565                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
00566             }
00567             # Close off any remaining tags
00568             while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
00569                 $text .= "</$t>\n";
00570                 if ( $t == 'table' ) {
00571                     $tagstack = array_pop( $tablestack );
00572                 }
00573             }
00574         } else {
00575             # this might be possible using tidy itself
00576             foreach ( $bits as $x ) {
00577                 preg_match(
00578                     '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00579                     $x,
00580                     $regs
00581                 );
00582 
                        # On a failed match $regs is empty; warnings from list() are silenced
00583                 wfSuppressWarnings();
00584                 list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00585                 wfRestoreWarnings();
00586 
00587                 $badtag = false;
00588                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00589                     if ( is_callable( $processCallback ) ) {
00590                         call_user_func_array( $processCallback, array( &$params, $args ) );
00591                     }
00592 
00593                     if ( !Sanitizer::validateTag( $params, $t ) ) {
00594                         $badtag = true;
00595                     }
00596 
00597                     $newparams = Sanitizer::fixTagAttributes( $params, $t );
00598                     if ( !$badtag ) {
00599                         $rest = str_replace( '>', '&gt;', $rest );
00600                         $text .= "<$slash$t$newparams$brace$rest";
00601                         continue;
00602                     }
00603                 }
00604                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
00605             }
00606         }
00607         wfProfileOut( __METHOD__ );
00608         return $text;
00609     }
00610 
00621     static function removeHTMLcomments( $text ) {
00622         wfProfileIn( __METHOD__ );
00623         while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
00624             $end = strpos( $text, '-->', $start + 4 );
00625             if ( $end === false ) {
00626                 # Unterminated comment; bail out
00627                 break;
00628             }
00629 
00630             $end += 3;
00631 
00632             # Trim space and newline if the comment is both
00633             # preceded and followed by a newline
00634             $spaceStart = max( $start - 1, 0 );
00635             $spaceLen = $end - $spaceStart;
00636             while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
00637                 $spaceStart--;
00638                 $spaceLen++;
00639             }
00640             while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
00641                 $spaceLen++;
00642             }
00643             if ( substr( $text, $spaceStart, 1 ) === "\n"
00644                 && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
00645                 # Remove the comment, leading and trailing
00646                 # spaces, and leave only one newline.
00647                 $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
00648             } else {
00649                 # Remove just the comment.
00650                 $text = substr_replace( $text, '', $start, $end - $start );
00651             }
00652         }
00653         wfProfileOut( __METHOD__ );
00654         return $text;
00655     }
00656 
00669     static function validateTag( $params, $element ) {
00670         $params = Sanitizer::decodeTagAttributes( $params );
00671 
00672         if ( $element == 'meta' || $element == 'link' ) {
00673             if ( !isset( $params['itemprop'] ) ) {
00674                 // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
00675                 return false;
00676             }
00677             if ( $element == 'meta' && !isset( $params['content'] ) ) {
00678                 // <meta> must have a content="" for the itemprop
00679                 return false;
00680             }
00681             if ( $element == 'link' && !isset( $params['href'] ) ) {
00682                 // <link> must have an associated href=""
00683                 return false;
00684             }
00685         }
00686 
00687         return true;
00688     }
00689 
00705     static function validateTagAttributes( $attribs, $element ) {
00706         return Sanitizer::validateAttributes( $attribs,
00707             Sanitizer::attributeWhitelist( $element ) );
00708     }
00709 
00725     static function validateAttributes( $attribs, $whitelist ) {
00726         global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
00727 
00728         $whitelist = array_flip( $whitelist );
00729         $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
00730 
00731         $out = array();
00732         foreach ( $attribs as $attribute => $value ) {
00733             #allow XML namespace declaration if RDFa is enabled
00734             if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
00735                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00736                     $out[$attribute] = $value;
00737                 }
00738 
00739                 continue;
00740             }
00741 
00742             # Allow any attribute beginning with "data-"
00743             if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
00744                 continue;
00745             }
00746 
00747             # Strip javascript "expression" from stylesheets.
00748             # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
00749             if ( $attribute == 'style' ) {
00750                 $value = Sanitizer::checkCss( $value );
00751             }
00752 
00753             if ( $attribute === 'id' ) {
00754                 $value = Sanitizer::escapeId( $value, 'noninitial' );
00755             }
00756 
00757             # WAI-ARIA
00758             # http://www.w3.org/TR/wai-aria/
00759             # http://www.whatwg.org/html/elements.html#wai-aria
00760             # For now we only support role="presentation" until we work out what roles should be
00761             # usable by content and we ensure that our code explicitly rejects patterns that
00762             # violate HTML5's ARIA restrictions.
00763             if ( $attribute === 'role' && $value !== 'presentation' ) {
00764                 continue;
00765             }
00766 
00767             // RDFa and microdata properties allow URLs, URIs and/or CURIs.
00768             // Check them for sanity.
00769             if ( $attribute === 'rel' || $attribute === 'rev'
00770                 # RDFa
00771                 || $attribute === 'about' || $attribute === 'property'
00772                 || $attribute === 'resource' || $attribute === 'datatype'
00773                 || $attribute === 'typeof'
00774                 # HTML5 microdata
00775                 || $attribute === 'itemid' || $attribute === 'itemprop'
00776                 || $attribute === 'itemref' || $attribute === 'itemscope'
00777                 || $attribute === 'itemtype'
00778             ) {
00779                 //Paranoia. Allow "simple" values but suppress javascript
00780                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00781                     continue;
00782                 }
00783             }
00784 
00785             # NOTE: even though elements using href/src are not allowed directly, supply
00786             #       validation code that can be used by tag hook handlers, etc
00787             if ( $attribute === 'href' || $attribute === 'src' ) {
00788                 if ( !preg_match( $hrefExp, $value ) ) {
00789                     continue; //drop any href or src attributes not using an allowed protocol.
00790                     // NOTE: this also drops all relative URLs
00791                 }
00792             }
00793 
00794             // If this attribute was previously set, override it.
00795             // Output should only have one attribute of each name.
00796             $out[$attribute] = $value;
00797         }
00798 
00799         if ( $wgAllowMicrodataAttributes ) {
00800             # itemtype, itemid, itemref don't make sense without itemscope
00801             if ( !array_key_exists( 'itemscope', $out ) ) {
00802                 unset( $out['itemtype'] );
00803                 unset( $out['itemid'] );
00804                 unset( $out['itemref'] );
00805             }
00806             # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
00807         }
00808         return $out;
00809     }
00810 
00821     static function mergeAttributes( $a, $b ) {
00822         $out = array_merge( $a, $b );
00823         if ( isset( $a['class'] ) && isset( $b['class'] )
00824             && is_string( $a['class'] ) && is_string( $b['class'] )
00825             && $a['class'] !== $b['class']
00826         ) {
00827             $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
00828                 -1, PREG_SPLIT_NO_EMPTY );
00829             $out['class'] = implode( ' ', array_unique( $classes ) );
00830         }
00831         return $out;
00832     }
00833 
00843     public static function normalizeCss( $value ) {
00844 
00845         // Decode character references like &#123;
00846         $value = Sanitizer::decodeCharReferences( $value );
00847 
00848         // Decode escape sequences and line continuation
00849         // See the grammar in the CSS 2 spec, appendix D.
00850         // This has to be done AFTER decoding character references.
00851         // This means it isn't possible for this function to return
00852         // unsanitized escape sequences. It is possible to manufacture
00853         // input that contains character references that decode to
00854         // escape sequences that decode to character references, but
00855         // it's OK for the return value to contain character references
00856         // because the caller is supposed to escape those anyway.
00857         static $decodeRegex;
00858         if ( !$decodeRegex ) {
00859             $space = '[\\x20\\t\\r\\n\\f]';
00860             $nl = '(?:\\n|\\r\\n|\\r|\\f)';
00861             $backslash = '\\\\';
00862             $decodeRegex = "/ $backslash
00863                 (?:
00864                     ($nl) |  # 1. Line continuation
00865                     ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
00866                     (.) | # 3. backslash cancelling special meaning
00867                     () | # 4. backslash at end of string
00868                 )/xu";
00869         }
00870         $value = preg_replace_callback( $decodeRegex,
00871             array( __CLASS__, 'cssDecodeCallback' ), $value );
00872 
00873         // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
00874         $value = preg_replace_callback(
00875             '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
00876             function ( $matches ) {
00877                 $cp = utf8ToCodepoint( $matches[0] );
00878                 if ( $cp === false ) {
00879                     return '';
00880                 }
00881                 return chr( $cp - 65248 ); // ASCII range \x21-\x7A
00882             },
00883             $value
00884         );
00885 
00886         // Convert more characters IE6 might treat as ascii
00887         // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
00888         $value = str_replace(
00889             array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
00890             array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
00891             $value
00892         );
00893 
00894         // Let the value through if it's nothing but a single comment, to
00895         // allow other functions which may reject it to pass some error
00896         // message through.
00897         if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
00898             // Remove any comments; IE gets token splitting wrong
00899             // This must be done AFTER decoding character references and
00900             // escape sequences, because those steps can introduce comments
00901             // This step cannot introduce character references or escape
00902             // sequences, because it replaces comments with spaces rather
00903             // than removing them completely.
00904             $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
00905 
00906             // Remove anything after a comment-start token, to guard against
00907             // incorrect client implementations.
00908             $commentPos = strpos( $value, '/*' );
00909             if ( $commentPos !== false ) {
00910                 $value = substr( $value, 0, $commentPos );
00911             }
00912         }
00913 
00914         // S followed by repeat, iteration, or prolonged sound marks,
00915         // which IE will treat as "ss"
00916         $value = preg_replace(
00917             '/s(?:
00918                 \xE3\x80\xB1 | # U+3031
00919                 \xE3\x82\x9D | # U+309D
00920                 \xE3\x83\xBC | # U+30FC
00921                 \xE3\x83\xBD | # U+30FD
00922                 \xEF\xB9\xBC | # U+FE7C
00923                 \xEF\xB9\xBD | # U+FE7D
00924                 \xEF\xBD\xB0   # U+FF70
00925             )/ix',
00926             'ss',
00927             $value
00928         );
00929 
00930         return $value;
00931     }
00932 
00933 
00952     static function checkCss( $value ) {
00953         $value = self::normalizeCss( $value );
00954 
00955         // Reject problematic keywords and control characters
00956         if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
00957             return '/* invalid control char */';
00958         } elseif ( preg_match(
00959             '! expression
00960                 | filter\s*:
00961                 | accelerator\s*:
00962                 | -o-link\s*:
00963                 | -o-link-source\s*:
00964                 | -o-replace\s*:
00965                 | url\s*\(
00966                 | image\s*\(
00967                 | image-set\s*\(
00968             !ix', $value ) ) {
00969             return '/* insecure input */';
00970         }
00971         return $value;
00972     }
00973 
00978     static function cssDecodeCallback( $matches ) {
00979         if ( $matches[1] !== '' ) {
00980             // Line continuation
00981             return '';
00982         } elseif ( $matches[2] !== '' ) {
00983             $char = codepointToUtf8( hexdec( $matches[2] ) );
00984         } elseif ( $matches[3] !== '' ) {
00985             $char = $matches[3];
00986         } else {
00987             $char = '\\';
00988         }
00989         if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
00990             // These characters need to be escaped in strings
00991             // Clean up the escape sequence to avoid parsing errors by clients
00992             return '\\' . dechex( ord( $char ) ) . ' ';
00993         } else {
00994             // Decode unnecessary escape
00995             return $char;
00996         }
00997     }
00998 
01018     static function fixTagAttributes( $text, $element ) {
01019         if ( trim( $text ) == '' ) {
01020             return '';
01021         }
01022 
01023         $decoded = Sanitizer::decodeTagAttributes( $text );
01024         $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
01025 
01026         return Sanitizer::safeEncodeTagAttributes( $stripped );
01027     }
01028 
01034     static function encodeAttribute( $text ) {
01035         $encValue = htmlspecialchars( $text, ENT_QUOTES );
01036 
01037         // Whitespace is normalized during attribute decoding,
01038         // so if we've been passed non-spaces we must encode them
01039         // ahead of time or they won't be preserved.
01040         $encValue = strtr( $encValue, array(
01041             "\n" => '&#10;',
01042             "\r" => '&#13;',
01043             "\t" => '&#9;',
01044         ) );
01045 
01046         return $encValue;
01047     }
01048 
01055     static function safeEncodeAttribute( $text ) {
01056         $encValue = Sanitizer::encodeAttribute( $text );
01057 
01058         # Templates and links may be expanded in later parsing,
01059         # creating invalid or dangerous output. Suppress this.
01060         $encValue = strtr( $encValue, array(
01061             '<'    => '&lt;',   // This should never happen,
01062             '>'    => '&gt;',   // we've received invalid input
01063             '"'    => '&quot;', // which should have been escaped.
01064             '{'    => '&#123;',
01065             '['    => '&#91;',
01066             "''"   => '&#39;&#39;',
01067             'ISBN' => '&#73;SBN',
01068             'RFC'  => '&#82;FC',
01069             'PMID' => '&#80;MID',
01070             '|'    => '&#124;',
01071             '__'   => '&#95;_',
01072         ) );
01073 
01074         # Stupid hack
01075         $encValue = preg_replace_callback(
01076             '/((?i)' . wfUrlProtocols() . ')/',
01077             array( 'Sanitizer', 'armorLinksCallback' ),
01078             $encValue );
01079         return $encValue;
01080     }
01081 
01113     static function escapeId( $id, $options = array() ) {
01114         global $wgExperimentalHtmlIds;
01115         $options = (array)$options;
01116 
01117         $id = Sanitizer::decodeCharReferences( $id );
01118 
01119         if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
01120             $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
01121             $id = trim( $id, '_' );
01122             if ( $id === '' ) {
01123                 # Must have been all whitespace to start with.
01124                 return '_';
01125             } else {
01126                 return $id;
01127             }
01128         }
01129 
01130         # HTML4-style escaping
01131         static $replace = array(
01132             '%3A' => ':',
01133             '%' => '.'
01134         );
01135 
01136         $id = urlencode( strtr( $id, ' ', '_' ) );
01137         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
01138 
01139         if ( !preg_match( '/^[a-zA-Z]/', $id )
01140         && !in_array( 'noninitial', $options ) ) {
01141             // Initial character must be a letter!
01142             $id = "x$id";
01143         }
01144         return $id;
01145     }
01146 
01158     static function escapeClass( $class ) {
01159         // Convert ugly stuff to underscores and kill underscores in ugly places
01160         return rtrim( preg_replace(
01161             array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ),
01162             '_',
01163             $class ), '_' );
01164     }
01165 
01173     static function escapeHtmlAllowEntities( $html ) {
01174         $html = Sanitizer::decodeCharReferences( $html );
01175         # It seems wise to escape ' as well as ", as a matter of course.  Can't
01176         # hurt.
01177         $html = htmlspecialchars( $html, ENT_QUOTES );
01178         return $html;
01179     }
01180 
01186     private static function armorLinksCallback( $matches ) {
01187         return str_replace( ':', '&#58;', $matches[1] );
01188     }
01189 
01198     public static function decodeTagAttributes( $text ) {
01199         if ( trim( $text ) == '' ) {
01200             return array();
01201         }
01202 
01203         $attribs = array();
01204         $pairs = array();
01205         if ( !preg_match_all(
01206             self::getAttribsRegex(),
01207             $text,
01208             $pairs,
01209             PREG_SET_ORDER ) ) {
01210             return $attribs;
01211         }
01212 
01213         foreach ( $pairs as $set ) {
01214             $attribute = strtolower( $set[1] );
01215             $value = Sanitizer::getTagAttributeCallback( $set );
01216 
01217             // Normalize whitespace
01218             $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
01219             $value = trim( $value );
01220 
01221             // Decode character references
01222             $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
01223         }
01224         return $attribs;
01225     }
01226 
01234     public static function safeEncodeTagAttributes( $assoc_array ) {
01235         $attribs = array();
01236         foreach ( $assoc_array as $attribute => $value ) {
01237             $encAttribute = htmlspecialchars( $attribute );
01238             $encValue = Sanitizer::safeEncodeAttribute( $value );
01239 
01240             $attribs[] = "$encAttribute=\"$encValue\"";
01241         }
01242         return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
01243     }
01244 
01253     private static function getTagAttributeCallback( $set ) {
01254         if ( isset( $set[6] ) ) {
01255             # Illegal #XXXXXX color with no quotes.
01256             return $set[6];
01257         } elseif ( isset( $set[5] ) ) {
01258             # No quotes.
01259             return $set[5];
01260         } elseif ( isset( $set[4] ) ) {
01261             # Single-quoted
01262             return $set[4];
01263         } elseif ( isset( $set[3] ) ) {
01264             # Double-quoted
01265             return $set[3];
01266         } elseif ( !isset( $set[2] ) ) {
01267             # In XHTML, attributes must have a value.
01268             # For 'reduced' form, return explicitly the attribute name here.
01269             return $set[1];
01270         } else {
01271             throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
01272         }
01273     }
01274 
01287     private static function normalizeAttributeValue( $text ) {
01288         return str_replace( '"', '&quot;',
01289             self::normalizeWhitespace(
01290                 Sanitizer::normalizeCharReferences( $text ) ) );
01291     }
01292 
01297     private static function normalizeWhitespace( $text ) {
01298         return preg_replace(
01299             '/\r\n|[\x20\x0d\x0a\x09]/',
01300             ' ',
01301             $text );
01302     }
01303 
01312     static function normalizeSectionNameWhitespace( $section ) {
01313         return trim( preg_replace( '/[ _]+/', ' ', $section ) );
01314     }
01315 
01331     static function normalizeCharReferences( $text ) {
01332         return preg_replace_callback(
01333             self::CHAR_REFS_REGEX,
01334             array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
01335             $text );
01336     }
01337 
01342     static function normalizeCharReferencesCallback( $matches ) {
01343         $ret = null;
01344         if ( $matches[1] != '' ) {
01345             $ret = Sanitizer::normalizeEntity( $matches[1] );
01346         } elseif ( $matches[2] != '' ) {
01347             $ret = Sanitizer::decCharReference( $matches[2] );
01348         } elseif ( $matches[3] != '' ) {
01349             $ret = Sanitizer::hexCharReference( $matches[3] );
01350         }
01351         if ( is_null( $ret ) ) {
01352             return htmlspecialchars( $matches[0] );
01353         } else {
01354             return $ret;
01355         }
01356     }
01357 
01368     static function normalizeEntity( $name ) {
01369         if ( isset( self::$htmlEntityAliases[$name] ) ) {
01370             return '&' . self::$htmlEntityAliases[$name] . ';';
01371         } elseif ( in_array( $name,
01372         array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
01373             return "&$name;";
01374         } elseif ( isset( self::$htmlEntities[$name] ) ) {
01375             return '&#' . self::$htmlEntities[$name] . ';';
01376         } else {
01377             return "&amp;$name;";
01378         }
01379     }
01380 
01385     static function decCharReference( $codepoint ) {
01386         $point = intval( $codepoint );
01387         if ( Sanitizer::validateCodepoint( $point ) ) {
01388             return sprintf( '&#%d;', $point );
01389         } else {
01390             return null;
01391         }
01392     }
01393 
01398     static function hexCharReference( $codepoint ) {
01399         $point = hexdec( $codepoint );
01400         if ( Sanitizer::validateCodepoint( $point ) ) {
01401             return sprintf( '&#x%x;', $point );
01402         } else {
01403             return null;
01404         }
01405     }
01406 
01412     private static function validateCodepoint( $codepoint ) {
01413         return $codepoint == 0x09
01414             || $codepoint == 0x0a
01415             || $codepoint == 0x0d
01416             || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
01417             || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
01418             || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
01419     }
01420 
01428     public static function decodeCharReferences( $text ) {
01429         return preg_replace_callback(
01430             self::CHAR_REFS_REGEX,
01431             array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01432             $text );
01433     }
01434 
01445     public static function decodeCharReferencesAndNormalize( $text ) {
01446         global $wgContLang;
01447         $text = preg_replace_callback(
01448             self::CHAR_REFS_REGEX,
01449             array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01450             $text, /* limit */ -1, $count );
01451 
01452         if ( $count ) {
01453             return $wgContLang->normalize( $text );
01454         } else {
01455             return $text;
01456         }
01457     }
01458 
01463     static function decodeCharReferencesCallback( $matches ) {
01464         if ( $matches[1] != '' ) {
01465             return Sanitizer::decodeEntity( $matches[1] );
01466         } elseif ( $matches[2] != '' ) {
01467             return Sanitizer::decodeChar( intval( $matches[2] ) );
01468         } elseif ( $matches[3] != '' ) {
01469             return Sanitizer::decodeChar( hexdec( $matches[3] ) );
01470         }
01471         # Last case should be an ampersand by itself
01472         return $matches[0];
01473     }
01474 
01482     static function decodeChar( $codepoint ) {
01483         if ( Sanitizer::validateCodepoint( $codepoint ) ) {
01484             return codepointToUtf8( $codepoint );
01485         } else {
01486             return UTF8_REPLACEMENT;
01487         }
01488     }
01489 
01498     static function decodeEntity( $name ) {
01499         if ( isset( self::$htmlEntityAliases[$name] ) ) {
01500             $name = self::$htmlEntityAliases[$name];
01501         }
01502         if ( isset( self::$htmlEntities[$name] ) ) {
01503             return codepointToUtf8( self::$htmlEntities[$name] );
01504         } else {
01505             return "&$name;";
01506         }
01507     }
01508 
01515     static function attributeWhitelist( $element ) {
01516         $list = Sanitizer::setupAttributeWhitelist();
01517         return isset( $list[$element] )
01518             ? $list[$element]
01519             : array();
01520     }
01521 
01527     static function setupAttributeWhitelist() {
01528         global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
01529         static $whitelist, $staticInitialised;
01530 
01531         $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
01532 
01533         if ( $whitelist !== null && $staticInitialised == $globalContext ) {
01534             return $whitelist;
01535         }
01536 
01537         $common = array(
01538             # HTML
01539             'id',
01540             'class',
01541             'style',
01542             'lang',
01543             'dir',
01544             'title',
01545 
01546             # WAI-ARIA
01547             'role',
01548         );
01549 
01550         if ( $wgAllowRdfaAttributes ) {
01551             # RDFa attributes as specified in section 9 of
01552             # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
01553             $common = array_merge( $common, array(
01554                 'about', 'property', 'resource', 'datatype', 'typeof',
01555             ) );
01556         }
01557 
01558         if ( $wgAllowMicrodataAttributes ) {
01559             # add HTML5 microdata tags as specified by
01560             # http://www.whatwg.org/html/microdata.html#the-microdata-model
01561             $common = array_merge( $common, array(
01562                 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
01563             ) );
01564         }
01565 
01566         $block = array_merge( $common, array( 'align' ) );
01567         $tablealign = array( 'align', 'valign' );
01568         $tablecell = array(
01569             'abbr',
01570             'axis',
01571             'headers',
01572             'scope',
01573             'rowspan',
01574             'colspan',
01575             'nowrap', # deprecated
01576             'width', # deprecated
01577             'height', # deprecated
01578             'bgcolor', # deprecated
01579         );
01580 
01581         # Numbers refer to sections in HTML 4.01 standard describing the element.
01582         # See: http://www.w3.org/TR/html4/
01583         $whitelist = array(
01584             # 7.5.4
01585             'div'        => $block,
01586             'center'     => $common, # deprecated
01587             'span'       => $common,
01588 
01589             # 7.5.5
01590             'h1'         => $block,
01591             'h2'         => $block,
01592             'h3'         => $block,
01593             'h4'         => $block,
01594             'h5'         => $block,
01595             'h6'         => $block,
01596 
01597             # 7.5.6
01598             # address
01599 
01600             # 8.2.4
01601             'bdo'        => $common,
01602 
01603             # 9.2.1
01604             'em'         => $common,
01605             'strong'     => $common,
01606             'cite'       => $common,
01607             'dfn'        => $common,
01608             'code'       => $common,
01609             'samp'       => $common,
01610             'kbd'        => $common,
01611             'var'        => $common,
01612             'abbr'       => $common,
01613             # acronym
01614 
01615             # 9.2.2
01616             'blockquote' => array_merge( $common, array( 'cite' ) ),
01617             'q'          => array_merge( $common, array( 'cite' ) ),
01618 
01619             # 9.2.3
01620             'sub'        => $common,
01621             'sup'        => $common,
01622 
01623             # 9.3.1
01624             'p'          => $block,
01625 
01626             # 9.3.2
01627             'br'         => array_merge( $common, array( 'clear' ) ),
01628 
01629             # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
01630             'wbr'        => $common,
01631 
01632             # 9.3.4
01633             'pre'        => array_merge( $common, array( 'width' ) ),
01634 
01635             # 9.4
01636             'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01637             'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01638 
01639             # 10.2
01640             'ul'         => array_merge( $common, array( 'type' ) ),
01641             'ol'         => array_merge( $common, array( 'type', 'start' ) ),
01642             'li'         => array_merge( $common, array( 'type', 'value' ) ),
01643 
01644             # 10.3
01645             'dl'         => $common,
01646             'dd'         => $common,
01647             'dt'         => $common,
01648 
01649             # 11.2.1
01650             'table'      => array_merge( $common,
01651                                 array( 'summary', 'width', 'border', 'frame',
01652                                         'rules', 'cellspacing', 'cellpadding',
01653                                         'align', 'bgcolor',
01654                                 ) ),
01655 
01656             # 11.2.2
01657             'caption'    => $block,
01658 
01659             # 11.2.3
01660             'thead'      => $common,
01661             'tfoot'      => $common,
01662             'tbody'      => $common,
01663 
01664             # 11.2.4
01665             'colgroup'   => array_merge( $common, array( 'span' ) ),
01666             'col'        => array_merge( $common, array( 'span' ) ),
01667 
01668             # 11.2.5
01669             'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
01670 
01671             # 11.2.6
01672             'td'         => array_merge( $common, $tablecell, $tablealign ),
01673             'th'         => array_merge( $common, $tablecell, $tablealign ),
01674 
01675             # 12.2
01676             # NOTE: <a> is not allowed directly, but the attrib
01677             # whitelist is used from the Parser object
01678             'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
01679 
01680             # 13.2
01681             # Not usually allowed, but may be used for extension-style hooks
01682             # such as <math> when it is rasterized, or if $wgAllowImageTag is
01683             # true
01684             'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
01685 
01686             # 15.2.1
01687             'tt'         => $common,
01688             'b'          => $common,
01689             'i'          => $common,
01690             'big'        => $common,
01691             'small'      => $common,
01692             'strike'     => $common,
01693             's'          => $common,
01694             'u'          => $common,
01695 
01696             # 15.2.2
01697             'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
01698             # basefont
01699 
01700             # 15.3
01701             'hr'         => array_merge( $common, array( 'width' ) ),
01702 
01703             # HTML Ruby annotation text module, simple ruby only.
01704             # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
01705             'ruby'       => $common,
01706             # rbc
01707             'rb'         => $common,
01708             'rp'         => $common,
01709             'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
01710             'rtc'         => $common,
01711 
01712             # MathML root element, where used for extensions
01713             # 'title' may not be 100% valid here; it's XHTML
01714             # http://www.w3.org/TR/REC-MathML/
01715             'math'       => array( 'class', 'style', 'id', 'title' ),
01716 
01717             # HTML 5 section 4.6
01718             'bdi' => $common,
01719 
01720             # HTML5 elements, defined by:
01721             # http://www.whatwg.org/html/
01722             'data' => array_merge( $common, array( 'value' ) ),
01723             'time' => array_merge( $common, array( 'datetime' ) ),
01724             'mark' => $common,
01725 
01726             // meta and link are only permitted by removeHTMLtags when Microdata
01727             // is enabled so we don't bother adding a conditional to hide these
01728             // Also meta and link are only valid in WikiText as Microdata elements
01729             // (ie: validateTag rejects tags missing the attributes needed for Microdata)
01730             // So we don't bother including $common attributes that have no purpose.
01731             'meta' => array( 'itemprop', 'content' ),
01732             'link' => array( 'itemprop', 'href' ),
01733         );
01734 
01735         $staticInitialised = $globalContext;
01736 
01737         return $whitelist;
01738     }
01739 
01750     static function stripAllTags( $text ) {
01751         # Actual <tags>
01752         $text = StringUtils::delimiterReplace( '<', '>', '', $text );
01753 
01754         # Normalize &entities and whitespace
01755         $text = self::decodeCharReferences( $text );
01756         $text = self::normalizeWhitespace( $text );
01757 
01758         return $text;
01759     }
01760 
01770     static function hackDocType() {
01771         $out = "<!DOCTYPE html [\n";
01772         foreach ( self::$htmlEntities as $entity => $codepoint ) {
01773             $out .= "<!ENTITY $entity \"&#$codepoint;\">";
01774         }
01775         $out .= "]>\n";
01776         return $out;
01777     }
01778 
01783     static function cleanUrl( $url ) {
01784         # Normalize any HTML entities in input. They will be
01785         # re-escaped by makeExternalLink().
01786         $url = Sanitizer::decodeCharReferences( $url );
01787 
01788         # Escape any control characters introduced by the above step
01789         $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
01790             array( __CLASS__, 'cleanUrlCallback' ), $url );
01791 
01792         # Validate hostname portion
01793         $matches = array();
01794         if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
01795             list( /* $whole */, $protocol, $host, $rest ) = $matches;
01796 
01797             // Characters that will be ignored in IDNs.
01798             // http://tools.ietf.org/html/3454#section-3.1
01799             // Strip them before further processing so blacklists and such work.
01800             $strip = "/
01801                 \\s|          # general whitespace
01802                 \xc2\xad|     # 00ad SOFT HYPHEN
01803                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
01804                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
01805                 \xe2\x81\xa0| # 2060 WORD JOINER
01806                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
01807                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
01808                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
01809                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
01810                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
01811                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
01812                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
01813                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
01814                 /xuD";
01815 
01816             $host = preg_replace( $strip, '', $host );
01817 
01818             // @todo FIXME: Validate hostnames here
01819 
01820             return $protocol . $host . $rest;
01821         } else {
01822             return $url;
01823         }
01824     }
01825 
01830     static function cleanUrlCallback( $matches ) {
01831         return urlencode( $matches[0] );
01832     }
01833 
01862     public static function validateEmail( $addr ) {
01863         $result = null;
01864         if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
01865             return $result;
01866         }
01867 
01868         // Please note strings below are enclosed in brackets [], this make the
01869         // hyphen "-" a range indicator. Hence it is double backslashed below.
01870         // See bug 26948
01871         $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
01872         $rfc1034_ldh_str = "a-z0-9\\-";
01873 
01874         $html5_email_regexp = "/
01875         ^                      # start of string
01876         [$rfc5322_atext\\.]+    # user part which is liberal :p
01877         @                      # 'apostrophe'
01878         [$rfc1034_ldh_str]+       # First domain part
01879         (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
01880         $                      # End of string
01881         /ix"; // case Insensitive, eXtended
01882 
01883         return (bool)preg_match( $html5_email_regexp, $addr );
01884     }
01885 }