php/html/Sanitizer_8php_source.html

00001 <?php
00031 class Sanitizer {
          /**
           * Extended-mode (/x) regular expression matching character references.
           * The four alternatives capture, in order:
           *   1. the name of a named reference ( &name; ),
           *   2. the digits of a decimal numeric reference ( &#123; ),
           *   3. the digits of a hexadecimal numeric reference ( &#x1F; or &#X1F; ),
           *   4. a bare ampersand that is not part of any of the above.
           */
00036     const CHAR_REFS_REGEX =
00037         '/&([A-Za-z0-9\x80-\xff]+);
00038          |&\#([0-9]+);
00039          |&\#[xX]([0-9A-Fa-f]+);
00040          |(&)/x';
00041
          /**
           * Case-insensitive pattern for values that look like script URIs: the word
           * "javascript" or "vbscript" at the start of the string, after whitespace,
           * or after the end of a comment ( "*" "/" ), followed by a non-word character
           * or the end of the string. validateAttributes() drops attribute values
           * that match it.
           */
00050     const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
          /**
           * Pattern for XML namespace declaration attribute names ( xmlns:prefix ).
           * validateAttributes() uses it to let such attributes through when
           * $wgAllowRdfaAttributes is set.
           */
00051     const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052
          /**
           * Map from the name of an HTML character entity (without the leading "&"
           * and trailing ";") to its numeric code point, e.g. 'amp' => 38.
           * Keys are case-sensitive ( 'dagger' and 'Dagger' are different entities ).
           */
00058     private static $htmlEntities = array(
00059         'Aacute'   => 193,
00060         'aacute'   => 225,
00061         'Acirc'    => 194,
00062         'acirc'    => 226,
00063         'acute'    => 180,
00064         'AElig'    => 198,
00065         'aelig'    => 230,
00066         'Agrave'   => 192,
00067         'agrave'   => 224,
00068         'alefsym'  => 8501,
00069         'Alpha'    => 913,
00070         'alpha'    => 945,
00071         'amp'      => 38,
00072         'and'      => 8743,
00073         'ang'      => 8736,
00074         'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00075         'Aring'    => 197,
00076         'aring'    => 229,
00077         'asymp'    => 8776,
00078         'Atilde'   => 195,
00079         'atilde'   => 227,
00080         'Auml'     => 196,
00081         'auml'     => 228,
00082         'bdquo'    => 8222,
00083         'Beta'     => 914,
00084         'beta'     => 946,
00085         'brvbar'   => 166,
00086         'bull'     => 8226,
00087         'cap'      => 8745,
00088         'Ccedil'   => 199,
00089         'ccedil'   => 231,
00090         'cedil'    => 184,
00091         'cent'     => 162,
00092         'Chi'      => 935,
00093         'chi'      => 967,
00094         'circ'     => 710,
00095         'clubs'    => 9827,
00096         'cong'     => 8773,
00097         'copy'     => 169,
00098         'crarr'    => 8629,
00099         'cup'      => 8746,
00100         'curren'   => 164,
00101         'dagger'   => 8224,
00102         'Dagger'   => 8225,
00103         'darr'     => 8595,
00104         'dArr'     => 8659,
00105         'deg'      => 176,
00106         'Delta'    => 916,
00107         'delta'    => 948,
00108         'diams'    => 9830,
00109         'divide'   => 247,
00110         'Eacute'   => 201,
00111         'eacute'   => 233,
00112         'Ecirc'    => 202,
00113         'ecirc'    => 234,
00114         'Egrave'   => 200,
00115         'egrave'   => 232,
00116         'empty'    => 8709,
00117         'emsp'     => 8195,
00118         'ensp'     => 8194,
00119         'Epsilon'  => 917,
00120         'epsilon'  => 949,
00121         'equiv'    => 8801,
00122         'Eta'      => 919,
00123         'eta'      => 951,
00124         'ETH'      => 208,
00125         'eth'      => 240,
00126         'Euml'     => 203,
00127         'euml'     => 235,
00128         'euro'     => 8364,
00129         'exist'    => 8707,
00130         'fnof'     => 402,
00131         'forall'   => 8704,
00132         'frac12'   => 189,
00133         'frac14'   => 188,
00134         'frac34'   => 190,
00135         'frasl'    => 8260,
00136         'Gamma'    => 915,
00137         'gamma'    => 947,
00138         'ge'       => 8805,
00139         'gt'       => 62,
00140         'harr'     => 8596,
00141         'hArr'     => 8660,
00142         'hearts'   => 9829,
00143         'hellip'   => 8230,
00144         'Iacute'   => 205,
00145         'iacute'   => 237,
00146         'Icirc'    => 206,
00147         'icirc'    => 238,
00148         'iexcl'    => 161,
00149         'Igrave'   => 204,
00150         'igrave'   => 236,
00151         'image'    => 8465,
00152         'infin'    => 8734,
00153         'int'      => 8747,
00154         'Iota'     => 921,
00155         'iota'     => 953,
00156         'iquest'   => 191,
00157         'isin'     => 8712,
00158         'Iuml'     => 207,
00159         'iuml'     => 239,
00160         'Kappa'    => 922,
00161         'kappa'    => 954,
00162         'Lambda'   => 923,
00163         'lambda'   => 955,
00164         'lang'     => 9001,
00165         'laquo'    => 171,
00166         'larr'     => 8592,
00167         'lArr'     => 8656,
00168         'lceil'    => 8968,
00169         'ldquo'    => 8220,
00170         'le'       => 8804,
00171         'lfloor'   => 8970,
00172         'lowast'   => 8727,
00173         'loz'      => 9674,
00174         'lrm'      => 8206,
00175         'lsaquo'   => 8249,
00176         'lsquo'    => 8216,
00177         'lt'       => 60,
00178         'macr'     => 175,
00179         'mdash'    => 8212,
00180         'micro'    => 181,
00181         'middot'   => 183,
00182         'minus'    => 8722,
00183         'Mu'       => 924,
00184         'mu'       => 956,
00185         'nabla'    => 8711,
00186         'nbsp'     => 160,
00187         'ndash'    => 8211,
00188         'ne'       => 8800,
00189         'ni'       => 8715,
00190         'not'      => 172,
00191         'notin'    => 8713,
00192         'nsub'     => 8836,
00193         'Ntilde'   => 209,
00194         'ntilde'   => 241,
00195         'Nu'       => 925,
00196         'nu'       => 957,
00197         'Oacute'   => 211,
00198         'oacute'   => 243,
00199         'Ocirc'    => 212,
00200         'ocirc'    => 244,
00201         'OElig'    => 338,
00202         'oelig'    => 339,
00203         'Ograve'   => 210,
00204         'ograve'   => 242,
00205         'oline'    => 8254,
00206         'Omega'    => 937,
00207         'omega'    => 969,
00208         'Omicron'  => 927,
00209         'omicron'  => 959,
00210         'oplus'    => 8853,
00211         'or'       => 8744,
00212         'ordf'     => 170,
00213         'ordm'     => 186,
00214         'Oslash'   => 216,
00215         'oslash'   => 248,
00216         'Otilde'   => 213,
00217         'otilde'   => 245,
00218         'otimes'   => 8855,
00219         'Ouml'     => 214,
00220         'ouml'     => 246,
00221         'para'     => 182,
00222         'part'     => 8706,
00223         'permil'   => 8240,
00224         'perp'     => 8869,
00225         'Phi'      => 934,
00226         'phi'      => 966,
00227         'Pi'       => 928,
00228         'pi'       => 960,
00229         'piv'      => 982,
00230         'plusmn'   => 177,
00231         'pound'    => 163,
00232         'prime'    => 8242,
00233         'Prime'    => 8243,
00234         'prod'     => 8719,
00235         'prop'     => 8733,
00236         'Psi'      => 936,
00237         'psi'      => 968,
00238         'quot'     => 34,
00239         'radic'    => 8730,
00240         'rang'     => 9002,
00241         'raquo'    => 187,
00242         'rarr'     => 8594,
00243         'rArr'     => 8658,
00244         'rceil'    => 8969,
00245         'rdquo'    => 8221,
00246         'real'     => 8476,
00247         'reg'      => 174,
00248         'rfloor'   => 8971,
00249         'Rho'      => 929,
00250         'rho'      => 961,
00251         'rlm'      => 8207,
00252         'rsaquo'   => 8250,
00253         'rsquo'    => 8217,
00254         'sbquo'    => 8218,
00255         'Scaron'   => 352,
00256         'scaron'   => 353,
00257         'sdot'     => 8901,
00258         'sect'     => 167,
00259         'shy'      => 173,
00260         'Sigma'    => 931,
00261         'sigma'    => 963,
00262         'sigmaf'   => 962,
00263         'sim'      => 8764,
00264         'spades'   => 9824,
00265         'sub'      => 8834,
00266         'sube'     => 8838,
00267         'sum'      => 8721,
00268         'sup'      => 8835,
00269         'sup1'     => 185,
00270         'sup2'     => 178,
00271         'sup3'     => 179,
00272         'supe'     => 8839,
00273         'szlig'    => 223,
00274         'Tau'      => 932,
00275         'tau'      => 964,
00276         'there4'   => 8756,
00277         'Theta'    => 920,
00278         'theta'    => 952,
00279         'thetasym' => 977,
00280         'thinsp'   => 8201,
00281         'THORN'    => 222,
00282         'thorn'    => 254,
00283         'tilde'    => 732,
00284         'times'    => 215,
00285         'trade'    => 8482,
00286         'Uacute'   => 218,
00287         'uacute'   => 250,
00288         'uarr'     => 8593,
00289         'uArr'     => 8657,
00290         'Ucirc'    => 219,
00291         'ucirc'    => 251,
00292         'Ugrave'   => 217,
00293         'ugrave'   => 249,
00294         'uml'      => 168,
00295         'upsih'    => 978,
00296         'Upsilon'  => 933,
00297         'upsilon'  => 965,
00298         'Uuml'     => 220,
00299         'uuml'     => 252,
00300         'weierp'   => 8472,
00301         'Xi'       => 926,
00302         'xi'       => 958,
00303         'Yacute'   => 221,
00304         'yacute'   => 253,
00305         'yen'      => 165,
00306         'Yuml'     => 376,
00307         'yuml'     => 255,
00308         'Zeta'     => 918,
00309         'zeta'     => 950,
00310         'zwj'      => 8205,
00311         'zwnj'     => 8204
00312     );
00313
          /**
           * Alternative entity names mapped to the canonical name they stand for.
           * Both entries here resolve to 'rlm'.
           * NOTE(review): the keys look like Hebrew- and Arabic-script spellings of
           * "rlm"; where the aliases are resolved is not visible in this part of the
           * file -- confirm against the entity-decoding code.
           */
00317     private static $htmlEntityAliases = array(
00318         'רלמ' => 'rlm',
00319         'رلم' => 'rlm',
00320     );
00321
          /**
           * Lazily-built cache for getAttribsRegex(); null until that method
           * has been called for the first time.
           */
00325     private static $attribsRegex;
00326
00332     static function getAttribsRegex() {
00333         if ( self::$attribsRegex === null ) {
00334             $attribFirst = '[:A-Z_a-z0-9]';
00335             $attrib = '[:A-Z_a-z-.0-9]';
00336             $space = '[\x09\x0a\x0d\x20]';
00337             self::$attribsRegex =
00338                 "/(?:^|$space)({$attribFirst}{$attrib}*)
00339                   ($space*=$space*
00340                     (?:
00341                      # The attribute value: quoted or alone
00342                       \"([^<\"]*)\"
00343                      | '([^<']*)'
00344                      |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00345                      |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00346                                          # colors are specified like this.
00347                                          # We'll be normalizing it.
00348                     )
00349                 )?(?=$space|\$)/sx";
00350         }
00351         return self::$attribsRegex;
00352     }
00353
00366     static function removeHTMLtags( $text, $processCallback = null,
00367         $args = array(), $extratags = array(), $removetags = array()
00368     ) {
00369         global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
00370
00371         static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00372             $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
00373
00374         wfProfileIn( __METHOD__ );
00375
00376         // Base our staticInitialised variable off of the global config state so that if the globals
00377         // are changed (like in the screwed up test system) we will re-initialise the settings.
00378         $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
00379         if ( !$staticInitialised || $staticInitialised != $globalContext ) {
00380
00381             $htmlpairsStatic = array( # Tags that must be closed
00382                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00383                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00384                 'strike', 'strong', 'tt', 'var', 'div', 'center',
00385                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00386                 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
00387                 'kbd', 'samp', 'data', 'time', 'mark'
00388             );
00389             $htmlsingle = array(
00390                 'br', 'wbr', 'hr', 'li', 'dt', 'dd'
00391             );
00392             $htmlsingleonly = array( # Elements that cannot have close tags
00393                 'br', 'wbr', 'hr'
00394             );
00395             if ( $wgAllowMicrodataAttributes ) {
00396                 $htmlsingle[] = $htmlsingleonly[] = 'meta';
00397                 $htmlsingle[] = $htmlsingleonly[] = 'link';
00398             }
00399             $htmlnest = array( # Tags that can be nested--??
00400                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00401                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
00402                 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
00403             );
00404             $tabletags = array( # Can only appear inside table, we will close them
00405                 'td', 'th', 'tr',
00406             );
00407             $htmllist = array( # Tags used by list
00408                 'ul', 'ol',
00409             );
00410             $listtags = array( # Tags that can appear in a list
00411                 'li',
00412             );
00413
00414             if ( $wgAllowImageTag ) {
00415                 $htmlsingle[] = 'img';
00416                 $htmlsingleonly[] = 'img';
00417             }
00418
00419             $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
00420             $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
00421
00422             # Convert them all to hashtables for faster lookup
00423             $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
00424                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
00425             foreach ( $vars as $var ) {
00426                 $$var = array_flip( $$var );
00427             }
00428             $staticInitialised = $globalContext;
00429         }
00430         # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
00431         $extratags = array_flip( $extratags );
00432         $removetags = array_flip( $removetags );
00433         $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
00434         $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
00435
00436         # Remove HTML comments
00437         $text = Sanitizer::removeHTMLcomments( $text );
00438         $bits = explode( '<', $text );
00439         $text = str_replace( '>', '&gt;', array_shift( $bits ) );
00440         if ( !$wgUseTidy ) {
00441             $tagstack = $tablestack = array();
00442             foreach ( $bits as $x ) {
00443                 $regs = array();
00444                 # $slash: Does the current element start with a '/'?
00445                 # $t: Current element name
00446                 # $params: String between element name and >
00447                 # $brace: Ending '>' or '/>'
00448                 # $rest: Everything until the next element of $bits
00449                 if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00450                     list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00451                 } else {
00452                     $slash = $t = $params = $brace = $rest = null;
00453                 }
00454
00455                 $badtag = false;
00456                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00457                     # Check our stack
00458                     if ( $slash && isset( $htmlsingleonly[$t] ) ) {
00459                         $badtag = true;
00460                     } elseif ( $slash ) {
00461                         # Closing a tag... is it the one we just opened?
00462                         $ot = @array_pop( $tagstack );
00463                         if ( $ot != $t ) {
00464                             if ( isset( $htmlsingleallowed[$ot] ) ) {
00465                                 # Pop all elements with an optional close tag
00466                                 # and see if we find a match below them
00467                                 $optstack = array();
00468                                 array_push( $optstack, $ot );
00469                                 wfSuppressWarnings();
00470                                 $ot = array_pop( $tagstack );
00471                                 wfRestoreWarnings();
00472                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
00473                                     array_push( $optstack, $ot );
00474                                     wfSuppressWarnings();
00475                                     $ot = array_pop( $tagstack );
00476                                     wfRestoreWarnings();
00477                                 }
00478                                 if ( $t != $ot ) {
00479                                     # No match. Push the optional elements back again
00480                                     $badtag = true;
00481                                     wfSuppressWarnings();
00482                                     $ot = array_pop( $optstack );
00483                                     wfRestoreWarnings();
00484                                     while ( $ot ) {
00485                                         array_push( $tagstack, $ot );
00486                                         wfSuppressWarnings();
00487                                         $ot = array_pop( $optstack );
00488                                         wfRestoreWarnings();
00489                                     }
00490                                 }
00491                             } else {
00492                                 @array_push( $tagstack, $ot );
00493                                 # <li> can be nested in <ul> or <ol>, skip those cases:
00494                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
00495                                     $badtag = true;
00496                                 }
00497                             }
00498                         } else {
00499                             if ( $t == 'table' ) {
00500                                 $tagstack = array_pop( $tablestack );
00501                             }
00502                         }
00503                         $newparams = '';
00504                     } else {
00505                         # Keep track for later
00506                         if ( isset( $tabletags[$t] ) &&
00507                         !in_array( 'table', $tagstack ) ) {
00508                             $badtag = true;
00509                         } elseif ( in_array( $t, $tagstack ) &&
00510                         !isset( $htmlnest[$t] ) ) {
00511                             $badtag = true;
00512                         # Is it a self closed htmlpair ? (bug 5487)
00513                         } elseif ( $brace == '/>' &&
00514                         isset( $htmlpairs[$t] ) ) {
00515                             $badtag = true;
00516                         } elseif ( isset( $htmlsingleonly[$t] ) ) {
00517                             # Hack to force empty tag for unclosable elements
00518                             $brace = '/>';
00519                         } elseif ( isset( $htmlsingle[$t] ) ) {
00520                             # Hack to not close $htmlsingle tags
00521                             $brace = null;
00522                             # Still need to push this optionally-closed tag to
00523                             # the tag stack so that we can match end tags
00524                             # instead of marking them as bad.
00525                             array_push( $tagstack, $t );
00526                         } elseif ( isset( $tabletags[$t] )
00527                         && in_array( $t, $tagstack ) ) {
00528                             // New table tag but forgot to close the previous one
00529                             $text .= "</$t>";
00530                         } else {
00531                             if ( $t == 'table' ) {
00532                                 array_push( $tablestack, $tagstack );
00533                                 $tagstack = array();
00534                             }
00535                             array_push( $tagstack, $t );
00536                         }
00537
00538                         # Replace any variables or template parameters with
00539                         # plaintext results.
00540                         if ( is_callable( $processCallback ) ) {
00541                             call_user_func_array( $processCallback, array( &$params, $args ) );
00542                         }
00543
00544                         if ( !Sanitizer::validateTag( $params, $t ) ) {
00545                             $badtag = true;
00546                         }
00547
00548                         # Strip non-approved attributes from the tag
00549                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
00550                     }
00551                     if ( !$badtag ) {
00552                         $rest = str_replace( '>', '&gt;', $rest );
00553                         $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00554                         $text .= "<$slash$t$newparams$close>$rest";
00555                         continue;
00556                     }
00557                 }
00558                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
00559             }
00560             # Close off any remaining tags
00561             while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
00562                 $text .= "</$t>\n";
00563                 if ( $t == 'table' ) {
00564                     $tagstack = array_pop( $tablestack );
00565                 }
00566             }
00567         } else {
00568             # this might be possible using tidy itself
00569             foreach ( $bits as $x ) {
00570                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00571                 $x, $regs );
00572                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00573                 $badtag = false;
00574                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00575                     if ( is_callable( $processCallback ) ) {
00576                         call_user_func_array( $processCallback, array( &$params, $args ) );
00577                     }
00578
00579                     if ( !Sanitizer::validateTag( $params, $t ) ) {
00580                         $badtag = true;
00581                     }
00582
00583                     $newparams = Sanitizer::fixTagAttributes( $params, $t );
00584                     if ( !$badtag ) {
00585                         $rest = str_replace( '>', '&gt;', $rest );
00586                         $text .= "<$slash$t$newparams$brace$rest";
00587                         continue;
00588                     }
00589                 }
00590                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
00591             }
00592         }
00593         wfProfileOut( __METHOD__ );
00594         return $text;
00595     }
00596
00607     static function removeHTMLcomments( $text ) {
00608         wfProfileIn( __METHOD__ );
00609         while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
00610             $end = strpos( $text, '-->', $start + 4 );
00611             if ( $end === false ) {
00612                 # Unterminated comment; bail out
00613                 break;
00614             }
00615
00616             $end += 3;
00617
00618             # Trim space and newline if the comment is both
00619             # preceded and followed by a newline
00620             $spaceStart = max( $start - 1, 0 );
00621             $spaceLen = $end - $spaceStart;
00622             while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
00623                 $spaceStart--;
00624                 $spaceLen++;
00625             }
00626             while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
00627                 $spaceLen++;
00628             }
00629             if ( substr( $text, $spaceStart, 1 ) === "\n"
00630                 && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
00631                 # Remove the comment, leading and trailing
00632                 # spaces, and leave only one newline.
00633                 $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
00634             } else {
00635                 # Remove just the comment.
00636                 $text = substr_replace( $text, '', $start, $end - $start );
00637             }
00638         }
00639         wfProfileOut( __METHOD__ );
00640         return $text;
00641     }
00642
00655     static function validateTag( $params, $element ) {
00656         $params = Sanitizer::decodeTagAttributes( $params );
00657
00658         if ( $element == 'meta' || $element == 'link' ) {
00659             if ( !isset( $params['itemprop'] ) ) {
00660                 // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
00661                 return false;
00662             }
00663             if ( $element == 'meta' && !isset( $params['content'] ) ) {
00664                 // <meta> must have a content="" for the itemprop
00665                 return false;
00666             }
00667             if ( $element == 'link' && !isset( $params['href'] ) ) {
00668                 // <link> must have an associated href=""
00669                 return false;
00670             }
00671         }
00672
00673         return true;
00674     }
00675
00691     static function validateTagAttributes( $attribs, $element ) {
00692         return Sanitizer::validateAttributes( $attribs,
00693             Sanitizer::attributeWhitelist( $element ) );
00694     }
00695
00711     static function validateAttributes( $attribs, $whitelist ) {
00712         global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
00713
00714         $whitelist = array_flip( $whitelist );
00715         $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
00716
00717         $out = array();
00718         foreach ( $attribs as $attribute => $value ) {
00719             #allow XML namespace declaration if RDFa is enabled
00720             if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
00721                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00722                     $out[$attribute] = $value;
00723                 }
00724
00725                 continue;
00726             }
00727
00728             # Allow any attribute beginning with "data-"
00729             if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
00730                 continue;
00731             }
00732
00733             # Strip javascript "expression" from stylesheets.
00734             # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
00735             if ( $attribute == 'style' ) {
00736                 $value = Sanitizer::checkCss( $value );
00737             }
00738
00739             if ( $attribute === 'id' ) {
00740                 $value = Sanitizer::escapeId( $value, 'noninitial' );
00741             }
00742
00743             # WAI-ARIA
00744             # http://www.w3.org/TR/wai-aria/
00745             # http://www.whatwg.org/html/elements.html#wai-aria
00746             # For now we only support role="presentation" until we work out what roles should be
00747             # usable by content and we ensure that our code explicitly rejects patterns that
00748             # violate HTML5's ARIA restrictions.
00749             if ( $attribute === 'role' && $value !== 'presentation' ) {
00750                 continue;
00751             }
00752
00753             // RDFa and microdata properties allow URLs, URIs and/or CURIs.
00754             // Check them for sanity.
00755             if ( $attribute === 'rel' || $attribute === 'rev'
00756                 # RDFa
00757                 || $attribute === 'about' || $attribute === 'property'
00758                 || $attribute === 'resource' || $attribute === 'datatype'
00759                 || $attribute === 'typeof'
00760                 # HTML5 microdata
00761                 || $attribute === 'itemid' || $attribute === 'itemprop'
00762                 || $attribute === 'itemref' || $attribute === 'itemscope'
00763                 || $attribute === 'itemtype'
00764             ) {
00765                 //Paranoia. Allow "simple" values but suppress javascript
00766                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00767                     continue;
00768                 }
00769             }
00770
00771             # NOTE: even though elements using href/src are not allowed directly, supply
00772             #       validation code that can be used by tag hook handlers, etc
00773             if ( $attribute === 'href' || $attribute === 'src' ) {
00774                 if ( !preg_match( $hrefExp, $value ) ) {
00775                     continue; //drop any href or src attributes not using an allowed protocol.
00776                     // NOTE: this also drops all relative URLs
00777                 }
00778             }
00779
00780             // If this attribute was previously set, override it.
00781             // Output should only have one attribute of each name.
00782             $out[$attribute] = $value;
00783         }
00784
00785         if ( $wgAllowMicrodataAttributes ) {
00786             # itemtype, itemid, itemref don't make sense without itemscope
00787             if ( !array_key_exists( 'itemscope', $out ) ) {
00788                 unset( $out['itemtype'] );
00789                 unset( $out['itemid'] );
00790                 unset( $out['itemref'] );
00791             }
00792             # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
00793         }
00794         return $out;
00795     }
00796
00807     static function mergeAttributes( $a, $b ) {
00808         $out = array_merge( $a, $b );
00809         if ( isset( $a['class'] ) && isset( $b['class'] )
00810             && is_string( $a['class'] ) && is_string( $b['class'] )
00811             && $a['class'] !== $b['class']
00812         ) {
00813             $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
00814                 -1, PREG_SPLIT_NO_EMPTY );
00815             $out['class'] = implode( ' ', array_unique( $classes ) );
00816         }
00817         return $out;
00818     }
00819
/**
 * Normalize a CSS value so that later checks see its canonical form.
 *
 * Steps, in order: decode HTML character references, decode CSS escape
 * sequences and line continuations, fold fullwidth and other look-alike
 * characters to ASCII, strip comments, and rewrite "s" followed by certain
 * repeat/iteration/prolonged-sound marks to "ss".
 *
 * @param string $value Raw CSS text
 * @return string Normalized CSS text
 */
public static function normalizeCss( $value ) {
    // Character references like &#123; go first: the escape decoding below
    // must run AFTER them, so no unsanitized escape sequence can be
    // returned. The result may still contain character references, which
    // is fine because the caller is supposed to escape those anyway.
    $css = self::decodeCharReferences( $value );

    // Escape sequences and line continuation; see the grammar in the
    // CSS 2 spec, appendix D. The pattern is built once per request.
    static $decodeRegex;
    if ( !$decodeRegex ) {
        $space = '[\\x20\\t\\r\\n\\f]';
        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
        $backslash = '\\\\';
        $decodeRegex = "/ $backslash
                (?:
                    ($nl) |  # 1. Line continuation
                    ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
                    (.) | # 3. backslash cancelling special meaning
                    () | # 4. backslash at end of string
                )/xu";
    }
    $css = preg_replace_callback(
        $decodeRegex,
        array( __CLASS__, 'cssDecodeCallback' ),
        $css
    );

    // Halfwidth and Fullwidth Unicode block, which IE6 might treat as ascii:
    // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
    $css = preg_replace_callback(
        '/[！-［］-ｚ]/u',
        function ( $hit ) {
            $codepoint = utf8ToCodepoint( $hit[0] );
            // Shift into the ASCII range \x21-\x7A; drop undecodable input
            return $codepoint === false ? '' : chr( $codepoint - 65248 );
        },
        $css
    );

    // More characters IE6 might treat as ascii:
    // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
    $lookalikes = array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' );
    $asciiForms = array( 'r', 'n', 'n', 'l', 'i', '(', '(' );
    $css = str_replace( $lookalikes, $asciiForms, $css );

    // A value that is nothing but a single comment is let through, so that
    // other functions which may reject input can pass an error message.
    $isLoneComment = preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $css );
    if ( !$isLoneComment ) {
        // Comments are replaced with spaces (IE gets token splitting wrong).
        // This must happen AFTER the decoding steps, which can introduce
        // comments; replacing with a space rather than nothing means this
        // step cannot itself create character references or escapes.
        $css = StringUtils::delimiterReplace( '/*', '*/', ' ', $css );

        // Anything after a remaining comment-start token is cut off, to
        // guard against incorrect client implementations.
        $openComment = strpos( $css, '/*' );
        if ( $openComment !== false ) {
            $css = substr( $css, 0, $openComment );
        }
    }

    // S followed by repeat, iteration, or prolonged sound marks,
    // which IE will treat as "ss"
    return preg_replace(
        '/s(?:
                \xE3\x80\xB1 | # U+3031
                \xE3\x82\x9D | # U+309D
                \xE3\x83\xBC | # U+30FC
                \xE3\x83\xBD | # U+30FD
                \xEF\xB9\xBC | # U+FE7C
                \xEF\xB9\xBD | # U+FE7D
                \xEF\xBD\xB0   # U+FF70
            )/ix',
        'ss',
        $css
    );
}
00918
00919
/**
 * Normalize a CSS value and reject it if it looks dangerous.
 *
 * The value is passed through normalizeCss() first. If the normalized text
 * contains certain control characters, or one of the blacklisted keywords /
 * function calls, a placeholder CSS comment is returned instead of the value.
 *
 * @param string $value Raw CSS text
 * @return string The normalized CSS, or '/* invalid control char *' . '/' or
 *   '/* insecure input *' . '/' (as literal comments) when rejected
 */
static function checkCss( $value ) {
    $normalized = self::normalizeCss( $value );

    // Control characters are rejected outright
    if ( preg_match( '/[\000-\010\013\016-\037\177]/', $normalized ) ) {
        return '/* invalid control char */';
    }

    // Problematic keywords and function-call syntax
    $insecurePattern = '! expression
                | filter\s*:
                | accelerator\s*:
                | -o-link\s*:
                | -o-link-source\s*:
                | -o-replace\s*:
                | url\s*\(
                | image\s*\(
                | image-set\s*\(
            !ix';
    if ( preg_match( $insecurePattern, $normalized ) ) {
        return '/* insecure input */';
    }

    return $normalized;
}
00959
/**
 * preg_replace_callback() handler for the CSS escape pattern built in
 * normalizeCss().
 *
 * @param array $matches Match groups: 1 = line continuation, 2 = hex
 *   character number, 3 = single escaped character; all empty means a
 *   backslash at the end of the string
 * @return string Replacement text for the escape sequence
 */
static function cssDecodeCallback( $matches ) {
    if ( $matches[1] !== '' ) {
        // Line continuation: backslash + newline vanishes entirely
        return '';
    }

    if ( $matches[2] !== '' ) {
        $decoded = codepointToUtf8( hexdec( $matches[2] ) );
    } elseif ( $matches[3] !== '' ) {
        $decoded = $matches[3];
    } else {
        $decoded = '\\';
    }

    switch ( $decoded ) {
        case "\n":
        case '"':
        case "'":
        case '\\':
            // These characters need to stay escaped inside strings; emit a
            // clean hex escape to avoid parsing errors by clients
            return '\\' . dechex( ord( $decoded ) ) . ' ';
        default:
            // The escape was unnecessary, so return the plain character
            return $decoded;
    }
}
00984
/**
 * Decode, validate and re-encode the attribute text of a tag.
 *
 * The text is run through decodeTagAttributes(), validateTagAttributes()
 * for the given element, and safeEncodeTagAttributes(), in that order.
 *
 * @param string $text Raw attribute text
 * @param string $element Element name the attributes belong to
 * @return string Encoded attribute text, or '' when $text is blank
 */
static function fixTagAttributes( $text, $element ) {
    if ( trim( $text ) != '' ) {
        return self::safeEncodeTagAttributes(
            self::validateTagAttributes(
                self::decodeTagAttributes( $text ),
                $element
            )
        );
    }

    return '';
}
01014
/**
 * Encode a string for use as an HTML attribute value.
 *
 * HTML special characters (including both quote styles) are escaped, and
 * newline, carriage return and tab become numeric character references.
 *
 * @param string $text Attribute value
 * @return string Encoded value
 */
static function encodeAttribute( $text ) {
    // Whitespace is normalized during attribute decoding, so any
    // non-space whitespace must be encoded ahead of time or it won't
    // be preserved.
    $whitespaceRefs = array(
        "\n" => '&#10;',
        "\r" => '&#13;',
        "\t" => '&#9;',
    );

    return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
01034
/**
 * Encode an attribute value and additionally neutralise character
 * sequences that have meaning in wikitext.
 *
 * After encodeAttribute(), brackets, braces, pipes, doubled apostrophes,
 * double underscores and the ISBN/RFC/PMID keywords are replaced with
 * character references, and the colon in anything matching
 * wfUrlProtocols() is replaced with &#58;.
 *
 * @param string $text Attribute value
 * @return string Encoded value
 */
static function safeEncodeAttribute( $text ) {
    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $wikitextArmor = array(
        '<'    => '&lt;',   // This should never happen,
        '>'    => '&gt;',   // we've received invalid input
        '"'    => '&quot;', // which should have been escaped.
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
    );
    $armored = strtr( self::encodeAttribute( $text ), $wikitextArmor );

    # Stupid hack: break up URL protocols so they are not auto-linked
    $protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';

    return preg_replace_callback(
        $protocolPattern,
        array( __CLASS__, 'armorLinksCallback' ),
        $armored
    );
}
01067
/**
 * Turn arbitrary text into a string usable as an HTML id.
 *
 * Two schemes exist. When $wgExperimentalHtmlIds is on and the 'legacy'
 * option is absent, character references are decoded and runs of
 * whitespace, underscores, quotes, '&', '#' and '%' collapse to a single
 * underscore. Otherwise HTML4-style escaping is used: spaces become
 * underscores, the result is URL-encoded with '%3A' turned back into ':'
 * and every other '%' into '.', and an 'x' is prefixed when the id does
 * not start with an ASCII letter (unless 'noninitial' is given).
 *
 * @param string $id Text to escape
 * @param string|array $options 'legacy' and/or 'noninitial'
 * @return string Escaped id
 */
static function escapeId( $id, $options = array() ) {
    global $wgExperimentalHtmlIds;
    $flags = (array)$options;

    if ( !$wgExperimentalHtmlIds || in_array( 'legacy', $flags ) ) {
        # HTML4-style escaping
        $escaped = urlencode( self::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
        # '%3A' must be handled before the bare '%'
        $escaped = str_replace( array( '%3A', '%' ), array( ':', '.' ), $escaped );

        $startsWithLetter = preg_match( '/^[a-zA-Z]/', $escaped );
        if ( !$startsWithLetter && !in_array( 'noninitial', $flags ) ) {
            // Initial character must be a letter!
            $escaped = "x$escaped";
        }

        return $escaped;
    }

    $escaped = self::decodeCharReferences( $id );
    $escaped = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $escaped );
    $escaped = trim( $escaped, '_' );

    # An empty result means there was nothing but separators to start with.
    return $escaped === '' ? '_' : $escaped;
}
01131
/**
 * Turn arbitrary text into a string usable as a CSS class name.
 *
 * A leading digit or hyphen, ASCII control characters and spaces, most
 * ASCII punctuation, and the byte pair \xC2\xA0 are each replaced by an
 * underscore; runs of underscores are then collapsed and trailing
 * underscores removed.
 *
 * @param string $class Text to escape
 * @return string Escaped class name
 */
static function escapeClass( $class ) {
    // Convert ugly stuff to underscores...
    $uglyChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
    $escaped = preg_replace( $uglyChars, '_', $class );

    // ...and kill underscores in ugly places
    $escaped = preg_replace( '/_+/', '_', $escaped );

    return rtrim( $escaped, '_' );
}
01150
/**
 * Escape text for HTML output after first decoding any character
 * references it contains, so existing references are not double-escaped.
 *
 * @param string $html Text that may contain character references
 * @return string Escaped text
 */
static function escapeHtmlAllowEntities( $html ) {
    # It seems wise to escape ' as well as ", as a matter of course.
    # Can't hurt.
    return htmlspecialchars( self::decodeCharReferences( $html ), ENT_QUOTES );
}
01165
/**
 * preg_replace_callback() handler used by safeEncodeAttribute(): replaces
 * every colon in the first capture group with &#58;.
 *
 * @param array $matches Match groups; index 1 is the text to armor
 * @return string Armored text
 */
private static function armorLinksCallback( $matches ) {
    $protocol = $matches[1];

    return str_replace( ':', '&#58;', $protocol );
}
01174
/**
 * Parse raw attribute text into an associative array.
 *
 * Attribute names are lower-cased. Values have runs of tab, CR, LF and
 * space collapsed to one space, are trimmed, and have character references
 * decoded. When a name occurs more than once the last occurrence wins.
 *
 * @param string $text Raw attribute text
 * @return array Map of attribute name => decoded value; empty when $text
 *   is blank or nothing matches
 */
public static function decodeTagAttributes( $text ) {
    $decoded = array();
    if ( trim( $text ) == '' ) {
        return $decoded;
    }

    $matchedSets = array();
    $found = preg_match_all(
        self::getAttribsRegex(),
        $text,
        $matchedSets,
        PREG_SET_ORDER
    );
    if ( !$found ) {
        return $decoded;
    }

    foreach ( $matchedSets as $set ) {
        $name = strtolower( $set[1] );
        $rawValue = self::getTagAttributeCallback( $set );

        // Normalize whitespace
        $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $rawValue ) );

        // Decode character references
        $decoded[$name] = self::decodeCharReferences( $collapsed );
    }

    return $decoded;
}
01211
/**
 * Build attribute text from an associative array.
 *
 * Each pair is rendered as name="value", with the name passed through
 * htmlspecialchars() and the value through safeEncodeAttribute(). Every
 * pair is preceded by a single space.
 *
 * @param array $assoc_array Map of attribute name => value
 * @return string Attribute text with a leading space, or '' when the
 *   array is empty
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
    $html = '';
    foreach ( $assoc_array as $attribute => $value ) {
        $encAttribute = htmlspecialchars( $attribute );
        $encValue = self::safeEncodeAttribute( $value );
        $html .= " $encAttribute=\"$encValue\"";
    }

    return $html;
}
01229
/**
 * Pick the attribute value out of one match set produced by the
 * attribute regex.
 *
 * The highest-numbered value group that is set wins: 6 (unquoted #XXXXXX
 * colour), 5 (unquoted), 4 (single-quoted), 3 (double-quoted). When group
 * 2 is not set either, the attribute has no value and its name is
 * returned instead.
 *
 * @param array $set One PREG_SET_ORDER match set
 * @return string The attribute value
 * @throws MWException When group 2 is set but no value group is
 */
private static function getTagAttributeCallback( $set ) {
    # 6: illegal #XXXXXX color with no quotes, 5: no quotes,
    # 4: single-quoted, 3: double-quoted
    foreach ( array( 6, 5, 4, 3 ) as $group ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( isset( $set[2] ) ) {
        throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
    }

    # In XHTML, attributes must have a value.
    # For 'reduced' form, return explicitly the attribute name here.
    return $set[1];
}
01259
/**
 * Normalize an attribute value: character references are normalized,
 * whitespace characters become spaces, and double quotes become &quot;.
 *
 * @param string $text Attribute value
 * @return string Normalized value
 */
private static function normalizeAttributeValue( $text ) {
    $normalized = self::normalizeCharReferences( $text );
    $normalized = self::normalizeWhitespace( $normalized );

    return str_replace( '"', '&quot;', $normalized );
}
01277
/**
 * Replace each CRLF pair, space, CR, LF or tab with a single space.
 * Consecutive whitespace is not collapsed, except that a CRLF pair counts
 * as one unit.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $whitespaceUnit = '/\r\n|[\x20\x0d\x0a\x09]/';

    return preg_replace( $whitespaceUnit, ' ', $text );
}
01288
/**
 * Collapse every run of spaces and underscores in a section name to one
 * space, then trim the result.
 *
 * @param string $section Section name
 * @return string Normalized section name
 */
static function normalizeSectionNameWhitespace( $section ) {
    $collapsed = preg_replace( '/[ _]+/', ' ', $section );

    return trim( $collapsed );
}
01300
/**
 * Normalize every character reference (and bare ampersand) in a string by
 * running normalizeCharReferencesCallback() over each match of
 * CHAR_REFS_REGEX.
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
    $handler = array( __CLASS__, 'normalizeCharReferencesCallback' );

    return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
01322
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Dispatches on which group of CHAR_REFS_REGEX matched: 1 = named entity,
 * 2 = decimal reference, 3 = hexadecimal reference. When none matched, or
 * the chosen helper returns null, the whole match is HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        $normalized = self::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        $normalized = self::decCharReference( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $normalized = self::hexCharReference( $matches[3] );
    } else {
        $normalized = null;
    }

    return $normalized === null
        ? htmlspecialchars( $matches[0] )
        : $normalized;
}
01342
/**
 * Normalize a named entity.
 *
 * Aliases are rewritten to the name they map to; lt, gt, amp and quot are
 * kept as named entities; other known entities become decimal character
 * references; unknown names get their ampersand escaped.
 *
 * @param string $name Entity name without '&' and ';'
 * @return string Normalized entity text
 */
static function normalizeEntity( $name ) {
    if ( isset( self::$htmlEntityAliases[$name] ) ) {
        return '&' . self::$htmlEntityAliases[$name] . ';';
    }

    $keptAsNamed = array( 'lt', 'gt', 'amp', 'quot' );
    if ( in_array( $name, $keptAsNamed ) ) {
        return "&$name;";
    }

    if ( isset( self::$htmlEntities[$name] ) ) {
        return '&#' . self::$htmlEntities[$name] . ';';
    }

    return "&amp;$name;";
}
01365
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null '&#N;' with N as a plain decimal integer, or null
 *   when validateCodepoint() rejects the code point
 */
static function decCharReference( $codepoint ) {
    $point = intval( $codepoint );

    return self::validateCodepoint( $point )
        ? sprintf( '&#%d;', $point )
        : null;
}
01378
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null '&#xN;' with N in lower-case hex, or null when
 *   validateCodepoint() rejects the code point
 */
static function hexCharReference( $codepoint ) {
    $point = hexdec( $codepoint );

    return self::validateCodepoint( $point )
        ? sprintf( '&#x%x;', $point )
        : null;
}
01391
/**
 * Tell whether a code point is in one of the accepted ranges: tab, LF,
 * CR, U+0020..U+D7FF, U+E000..U+FFFD or U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    // The three permitted control characters
    if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
        return true;
    }

    // Up to just below the surrogate block
    if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
        return true;
    }

    // After the surrogates, excluding U+FFFE and U+FFFF
    if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
        return true;
    }

    // Supplementary planes
    return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
01405
/**
 * Decode every character reference in a string by running
 * decodeCharReferencesCallback() over each match of CHAR_REFS_REGEX.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    $handler = array( __CLASS__, 'decodeCharReferencesCallback' );

    return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
01419
/**
 * Decode character references like decodeCharReferences() and, when at
 * least one match of CHAR_REFS_REGEX was processed, pass the result
 * through $wgContLang->normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
    global $wgContLang;

    $replacements = 0;
    $decoded = preg_replace_callback(
        self::CHAR_REFS_REGEX,
        array( __CLASS__, 'decodeCharReferencesCallback' ),
        $text,
        -1, /* no limit */
        $replacements
    );

    // Untouched text is returned without the normalization pass
    return $replacements ? $wgContLang->normalize( $decoded ) : $decoded;
}
01443
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * Dispatches on which group of CHAR_REFS_REGEX matched: 1 = named entity,
 * 2 = decimal reference, 3 = hexadecimal reference. Anything else is
 * returned unchanged.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return self::decodeEntity( $matches[1] );
    }
    if ( $matches[2] != '' ) {
        return self::decodeChar( intval( $matches[2] ) );
    }
    if ( $matches[3] != '' ) {
        return self::decodeChar( hexdec( $matches[3] ) );
    }

    # Last case should be an ampersand by itself
    return $matches[0];
}
01459
/**
 * Convert a code point to its character via codepointToUtf8(), or return
 * UTF8_REPLACEMENT when validateCodepoint() rejects the code point.
 *
 * @param int $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
    return self::validateCodepoint( $codepoint )
        ? codepointToUtf8( $codepoint )
        : UTF8_REPLACEMENT;
}
01474
/**
 * Decode a named entity to its character.
 *
 * Aliases are resolved first. A name that is not a known entity is
 * returned as '&name;' (using the resolved name when it was an alias).
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
    $canonical = isset( self::$htmlEntityAliases[$name] )
        ? self::$htmlEntityAliases[$name]
        : $name;

    if ( !isset( self::$htmlEntities[$canonical] ) ) {
        return "&$canonical;";
    }

    return codepointToUtf8( self::$htmlEntities[$canonical] );
}
01493
/**
 * Look up the allowed attributes for one element in the table built by
 * setupAttributeWhitelist().
 *
 * @param string $element Element name
 * @return array List of attribute names; empty for an unlisted element
 */
static function attributeWhitelist( $element ) {
    $whitelist = self::setupAttributeWhitelist();

    if ( isset( $whitelist[$element] ) ) {
        return $whitelist[$element];
    }

    return array();
}
01506
/**
 * Build the table of allowed attributes per element.
 *
 * The table is cached in a static variable and rebuilt only when the
 * values of $wgAllowRdfaAttributes / $wgAllowMicrodataAttributes differ
 * from those the cached table was built with.
 *
 * @return array Map of element name => list of allowed attribute names
 */
static function setupAttributeWhitelist() {
    global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

    static $cachedList, $cachedContext;
    $context = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

    if ( isset( $cachedList ) && $cachedContext == $context ) {
        return $cachedList;
    }

    # HTML attributes, followed by the WAI-ARIA 'role'
    $common = array( 'id', 'class', 'style', 'lang', 'dir', 'title', 'role' );

    if ( $wgAllowRdfaAttributes ) {
        # RDFa attributes as specified in section 9 of
        # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
        $rdfa = array( 'about', 'property', 'resource', 'datatype', 'typeof' );
        $common = array_merge( $common, $rdfa );
    }

    if ( $wgAllowMicrodataAttributes ) {
        # add HTML5 microdata tags as specified by
        # http://www.whatwg.org/html/microdata.html#the-microdata-model
        $microdata = array( 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype' );
        $common = array_merge( $common, $microdata );
    }

    $block = array_merge( $common, array( 'align' ) );
    $tablealign = array( 'align', 'valign' );
    $tablecell = array(
        'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
        # the remaining four are deprecated
        'nowrap', 'width', 'height', 'bgcolor',
    );

    $withCite = array_merge( $common, array( 'cite' ) );
    $withCiteDatetime = array_merge( $common, array( 'cite', 'datetime' ) );
    $withSpan = array_merge( $common, array( 'span' ) );
    $cell = array_merge( $common, $tablecell, $tablealign );

    # Each entry is ( element names, allowed attributes ). Entries are in
    # the order the elements appear in the resulting table.
    # Section numbers refer to the HTML 4.01 standard describing the
    # element; see http://www.w3.org/TR/html4/
    $spec = array(
        # 7.5.4 ('center' is deprecated)
        array( array( 'div' ), $block ),
        array( array( 'center', 'span' ), $common ),

        # 7.5.5
        array( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ), $block ),

        # 7.5.6 (address) is not listed

        # 8.2.4
        array( array( 'bdo' ), $common ),

        # 9.2.1 (acronym is not listed)
        array(
            array( 'em', 'strong', 'cite', 'dfn', 'code', 'samp', 'kbd', 'var', 'abbr' ),
            $common
        ),

        # 9.2.2
        array( array( 'blockquote', 'q' ), $withCite ),

        # 9.2.3
        array( array( 'sub', 'sup' ), $common ),

        # 9.3.1
        array( array( 'p' ), $block ),

        # 9.3.2
        array( array( 'br' ), array_merge( $common, array( 'clear' ) ) ),

        # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
        array( array( 'wbr' ), $common ),

        # 9.3.4
        array( array( 'pre' ), array_merge( $common, array( 'width' ) ) ),

        # 9.4
        array( array( 'ins', 'del' ), $withCiteDatetime ),

        # 10.2
        array( array( 'ul' ), array_merge( $common, array( 'type' ) ) ),
        array( array( 'ol' ), array_merge( $common, array( 'type', 'start' ) ) ),
        array( array( 'li' ), array_merge( $common, array( 'type', 'value' ) ) ),

        # 10.3
        array( array( 'dl', 'dd', 'dt' ), $common ),

        # 11.2.1
        array( array( 'table' ), array_merge( $common, array(
            'summary', 'width', 'border', 'frame',
            'rules', 'cellspacing', 'cellpadding',
            'align', 'bgcolor',
        ) ) ),

        # 11.2.2
        array( array( 'caption' ), $block ),

        # 11.2.3
        array( array( 'thead', 'tfoot', 'tbody' ), $common ),

        # 11.2.4
        array( array( 'colgroup', 'col' ), $withSpan ),

        # 11.2.5
        array( array( 'tr' ), array_merge( $common, array( 'bgcolor' ), $tablealign ) ),

        # 11.2.6
        array( array( 'td', 'th' ), $cell ),

        # 12.2
        # NOTE: <a> is not allowed directly, but the attrib
        # whitelist is used from the Parser object
        # (rel/rev esp. for RDFa)
        array( array( 'a' ), array_merge( $common, array( 'href', 'rel', 'rev' ) ) ),

        # 13.2
        # Not usually allowed, but may be used for extension-style hooks
        # such as <math> when it is rasterized, or if $wgAllowImageTag is
        # true
        array( array( 'img' ), array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ) ),

        # 15.2.1
        array( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ), $common ),

        # 15.2.2 (basefont is not listed)
        array( array( 'font' ), array_merge( $common, array( 'size', 'color', 'face' ) ) ),

        # 15.3
        array( array( 'hr' ), array_merge( $common, array( 'width' ) ) ),

        # HTML Ruby annotation text module, simple ruby only (no rbc, rtc;
        # 'rt' does not get 'rbspan').
        # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
        array( array( 'ruby', 'rb', 'rt', 'rp' ), $common ),

        # MathML root element, where used for extensions
        # 'title' may not be 100% valid here; it's XHTML
        # http://www.w3.org/TR/REC-MathML/
        array( array( 'math' ), array( 'class', 'style', 'id', 'title' ) ),

        # HTML 5 section 4.6
        array( array( 'bdi' ), $common ),

        # HTML5 elements, defined by:
        # http://www.whatwg.org/html/
        array( array( 'data' ), array_merge( $common, array( 'value' ) ) ),
        array( array( 'time' ), array_merge( $common, array( 'datetime' ) ) ),
        array( array( 'mark' ), $common ),

        // meta and link are only permitted by removeHTMLtags when Microdata
        // is enabled so we don't bother adding a conditional to hide these
        // Also meta and link are only valid in WikiText as Microdata elements
        // (ie: validateTag rejects tags missing the attributes needed for Microdata)
        // So we don't bother including $common attributes that have no purpose.
        array( array( 'meta' ), array( 'itemprop', 'content' ) ),
        array( array( 'link' ), array( 'itemprop', 'href' ) ),
    );

    $built = array();
    foreach ( $spec as $entry ) {
        list( $elements, $attributes ) = $entry;
        foreach ( $elements as $elementName ) {
            $built[$elementName] = $attributes;
        }
    }

    $cachedList = $built;
    $cachedContext = $context;

    return $cachedList;
}
01724
/**
 * Remove everything delimited by '<' and '>' from the text, then decode
 * character references and turn whitespace characters into spaces.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
    # Actual <tags>
    $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

    # Normalize &entities and whitespace
    return self::normalizeWhitespace(
        self::decodeCharReferences( $withoutTags )
    );
}
01745
/**
 * Build a DOCTYPE declaration whose internal subset declares every entry
 * of the $htmlEntities table as an entity mapped to its decimal character
 * reference.
 *
 * @return string
 */
static function hackDocType() {
    $declarations = '';
    foreach ( self::$htmlEntities as $entity => $codepoint ) {
        $declarations .= "<!ENTITY $entity \"&#$codepoint;\">";
    }

    return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
01763
/**
 * Clean up a URL.
 *
 * Character references are decoded; brackets, angle brackets, double
 * quotes, pipes, DEL and everything up to and including space are
 * URL-encoded; and, when the URL has a protocol, whitespace and a set of
 * invisible characters are removed from its host part.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
    # Normalize any HTML entities in input. The original author notes
    # that they will be re-escaped by makeExternalLink().
    $cleaned = self::decodeCharReferences( $url );

    # Escape any control characters introduced by the above step
    $cleaned = preg_replace_callback(
        '/[\][<>"\\x00-\\x20\\x7F\|]/',
        array( __CLASS__, 'cleanUrlCallback' ),
        $cleaned
    );

    # Split into protocol, optional //host, and the rest
    $parts = array();
    if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $cleaned, $parts ) ) {
        return $cleaned;
    }
    $protocol = $parts[1];
    $host = $parts[2];
    $rest = $parts[3];

    // Characters that will be ignored in IDNs (see RFC 3454 section 3.1).
    // Strip them before further processing so blacklists and such work.
    $strip = "/
                \\s|          # general whitespace
                \xc2\xad|     # 00ad SOFT HYPHEN
                \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                \xe2\x81\xa0| # 2060 WORD JOINER
                \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
                /xuD";

    // @todo FIXME: Validate hostnames here

    return $protocol . preg_replace( $strip, '', $host ) . $rest;
}
01810
/**
 * preg_replace_callback() handler for cleanUrl(): URL-encodes the whole
 * match.
 *
 * @param array $matches
 * @return string
 */
static function cleanUrlCallback( $matches ) {
    $unsafeChar = $matches[0];

    return urlencode( $unsafeChar );
}
01818
/**
 * Check whether a string looks like a valid e-mail address.
 *
 * The 'isValidEmailAddr' hook runs first; when it returns false, the
 * $result it set is returned as-is. Otherwise the address must consist of
 * a local part (atext characters and dots), an at sign, and one or more
 * dot-separated domain labels made of letters, digits and hyphens.
 *
 * The whole string must match: a trailing newline makes the address
 * invalid.
 *
 * @param string $addr E-mail address
 * @return bool|mixed True/false from the built-in check, or whatever the
 *   hook stored in $result when the hook aborted
 */
public static function validateEmail( $addr ) {
    $result = null;
    if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
        return $result;
    }

    // Please note strings below are enclosed in brackets [], this make the
    // hyphen "-" a range indicator. Hence it is double backslashed below.
    // See bug 26948
    $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
    $rfc1034_ldh_str = "a-z0-9\\-";

    // The D (dollar-end-only) modifier is required: without it "$" also
    // matches just before a final newline, so "user@example.com\n" would
    // be accepted as valid.
    $html5_email_regexp = "/
        ^                      # start of string
        [$rfc5322_atext\\.]+    # user part which is liberal :p
        @                      # at sign
        [$rfc1034_ldh_str]+       # First domain part
        (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
        $                      # End of string
        /ixD"; // case Insensitive, eXtended, Dollar matches only at the very end

    return (bool)preg_match( $html5_email_regexp, $addr );
}
01870 }