MediaWiki  REL1_22
Sanitizer.php
Go to the documentation of this file.
00001 <?php
00031 class Sanitizer {
00036     const CHAR_REFS_REGEX = // Extended-mode (/x) pattern matching one character reference or a stray ampersand
00037         '/&([A-Za-z0-9\x80-\xff]+);
00038          |&\#([0-9]+);
00039          |&\#[xX]([0-9A-Fa-f]+);
00040          |(&)/x'; // Groups: 1 = entity name, 2 = decimal number, 3 = hexadecimal number, 4 = bare '&'
00041 
00050     const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; // Case-insensitive: "javascript"/"vbscript" at the start, after whitespace or after a "*/", followed by a non-word character or the end; validateAttributes() drops values that match
00051     const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; // Attribute names of the form "xmlns:<name>"; validateAttributes() lets these through when $wgAllowRdfaAttributes is set
00052 
00058     private static $htmlEntities = array( // Named character entities: entity name => decimal code point (e.g. 'amp' => 38, 'lt' => 60); keys are case-sensitive
00059         'Aacute'   => 193,
00060         'aacute'   => 225,
00061         'Acirc'    => 194,
00062         'acirc'    => 226,
00063         'acute'    => 180,
00064         'AElig'    => 198,
00065         'aelig'    => 230,
00066         'Agrave'   => 192,
00067         'agrave'   => 224,
00068         'alefsym'  => 8501,
00069         'Alpha'    => 913,
00070         'alpha'    => 945,
00071         'amp'      => 38,
00072         'and'      => 8743,
00073         'ang'      => 8736,
00074         'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00075         'Aring'    => 197,
00076         'aring'    => 229,
00077         'asymp'    => 8776,
00078         'Atilde'   => 195,
00079         'atilde'   => 227,
00080         'Auml'     => 196,
00081         'auml'     => 228,
00082         'bdquo'    => 8222,
00083         'Beta'     => 914,
00084         'beta'     => 946,
00085         'brvbar'   => 166,
00086         'bull'     => 8226,
00087         'cap'      => 8745,
00088         'Ccedil'   => 199,
00089         'ccedil'   => 231,
00090         'cedil'    => 184,
00091         'cent'     => 162,
00092         'Chi'      => 935,
00093         'chi'      => 967,
00094         'circ'     => 710,
00095         'clubs'    => 9827,
00096         'cong'     => 8773,
00097         'copy'     => 169,
00098         'crarr'    => 8629,
00099         'cup'      => 8746,
00100         'curren'   => 164,
00101         'dagger'   => 8224,
00102         'Dagger'   => 8225,
00103         'darr'     => 8595,
00104         'dArr'     => 8659,
00105         'deg'      => 176,
00106         'Delta'    => 916,
00107         'delta'    => 948,
00108         'diams'    => 9830,
00109         'divide'   => 247,
00110         'Eacute'   => 201,
00111         'eacute'   => 233,
00112         'Ecirc'    => 202,
00113         'ecirc'    => 234,
00114         'Egrave'   => 200,
00115         'egrave'   => 232,
00116         'empty'    => 8709,
00117         'emsp'     => 8195,
00118         'ensp'     => 8194,
00119         'Epsilon'  => 917,
00120         'epsilon'  => 949,
00121         'equiv'    => 8801,
00122         'Eta'      => 919,
00123         'eta'      => 951,
00124         'ETH'      => 208,
00125         'eth'      => 240,
00126         'Euml'     => 203,
00127         'euml'     => 235,
00128         'euro'     => 8364,
00129         'exist'    => 8707,
00130         'fnof'     => 402,
00131         'forall'   => 8704,
00132         'frac12'   => 189,
00133         'frac14'   => 188,
00134         'frac34'   => 190,
00135         'frasl'    => 8260,
00136         'Gamma'    => 915,
00137         'gamma'    => 947,
00138         'ge'       => 8805,
00139         'gt'       => 62,
00140         'harr'     => 8596,
00141         'hArr'     => 8660,
00142         'hearts'   => 9829,
00143         'hellip'   => 8230,
00144         'Iacute'   => 205,
00145         'iacute'   => 237,
00146         'Icirc'    => 206,
00147         'icirc'    => 238,
00148         'iexcl'    => 161,
00149         'Igrave'   => 204,
00150         'igrave'   => 236,
00151         'image'    => 8465,
00152         'infin'    => 8734,
00153         'int'      => 8747,
00154         'Iota'     => 921,
00155         'iota'     => 953,
00156         'iquest'   => 191,
00157         'isin'     => 8712,
00158         'Iuml'     => 207,
00159         'iuml'     => 239,
00160         'Kappa'    => 922,
00161         'kappa'    => 954,
00162         'Lambda'   => 923,
00163         'lambda'   => 955,
00164         'lang'     => 9001,
00165         'laquo'    => 171,
00166         'larr'     => 8592,
00167         'lArr'     => 8656,
00168         'lceil'    => 8968,
00169         'ldquo'    => 8220,
00170         'le'       => 8804,
00171         'lfloor'   => 8970,
00172         'lowast'   => 8727,
00173         'loz'      => 9674,
00174         'lrm'      => 8206,
00175         'lsaquo'   => 8249,
00176         'lsquo'    => 8216,
00177         'lt'       => 60,
00178         'macr'     => 175,
00179         'mdash'    => 8212,
00180         'micro'    => 181,
00181         'middot'   => 183,
00182         'minus'    => 8722,
00183         'Mu'       => 924,
00184         'mu'       => 956,
00185         'nabla'    => 8711,
00186         'nbsp'     => 160,
00187         'ndash'    => 8211,
00188         'ne'       => 8800,
00189         'ni'       => 8715,
00190         'not'      => 172,
00191         'notin'    => 8713,
00192         'nsub'     => 8836,
00193         'Ntilde'   => 209,
00194         'ntilde'   => 241,
00195         'Nu'       => 925,
00196         'nu'       => 957,
00197         'Oacute'   => 211,
00198         'oacute'   => 243,
00199         'Ocirc'    => 212,
00200         'ocirc'    => 244,
00201         'OElig'    => 338,
00202         'oelig'    => 339,
00203         'Ograve'   => 210,
00204         'ograve'   => 242,
00205         'oline'    => 8254,
00206         'Omega'    => 937,
00207         'omega'    => 969,
00208         'Omicron'  => 927,
00209         'omicron'  => 959,
00210         'oplus'    => 8853,
00211         'or'       => 8744,
00212         'ordf'     => 170,
00213         'ordm'     => 186,
00214         'Oslash'   => 216,
00215         'oslash'   => 248,
00216         'Otilde'   => 213,
00217         'otilde'   => 245,
00218         'otimes'   => 8855,
00219         'Ouml'     => 214,
00220         'ouml'     => 246,
00221         'para'     => 182,
00222         'part'     => 8706,
00223         'permil'   => 8240,
00224         'perp'     => 8869,
00225         'Phi'      => 934,
00226         'phi'      => 966,
00227         'Pi'       => 928,
00228         'pi'       => 960,
00229         'piv'      => 982,
00230         'plusmn'   => 177,
00231         'pound'    => 163,
00232         'prime'    => 8242,
00233         'Prime'    => 8243,
00234         'prod'     => 8719,
00235         'prop'     => 8733,
00236         'Psi'      => 936,
00237         'psi'      => 968,
00238         'quot'     => 34,
00239         'radic'    => 8730,
00240         'rang'     => 9002,
00241         'raquo'    => 187,
00242         'rarr'     => 8594,
00243         'rArr'     => 8658,
00244         'rceil'    => 8969,
00245         'rdquo'    => 8221,
00246         'real'     => 8476,
00247         'reg'      => 174,
00248         'rfloor'   => 8971,
00249         'Rho'      => 929,
00250         'rho'      => 961,
00251         'rlm'      => 8207,
00252         'rsaquo'   => 8250,
00253         'rsquo'    => 8217,
00254         'sbquo'    => 8218,
00255         'Scaron'   => 352,
00256         'scaron'   => 353,
00257         'sdot'     => 8901,
00258         'sect'     => 167,
00259         'shy'      => 173,
00260         'Sigma'    => 931,
00261         'sigma'    => 963,
00262         'sigmaf'   => 962,
00263         'sim'      => 8764,
00264         'spades'   => 9824,
00265         'sub'      => 8834,
00266         'sube'     => 8838,
00267         'sum'      => 8721,
00268         'sup'      => 8835,
00269         'sup1'     => 185,
00270         'sup2'     => 178,
00271         'sup3'     => 179,
00272         'supe'     => 8839,
00273         'szlig'    => 223,
00274         'Tau'      => 932,
00275         'tau'      => 964,
00276         'there4'   => 8756,
00277         'Theta'    => 920,
00278         'theta'    => 952,
00279         'thetasym' => 977,
00280         'thinsp'   => 8201,
00281         'THORN'    => 222,
00282         'thorn'    => 254,
00283         'tilde'    => 732,
00284         'times'    => 215,
00285         'trade'    => 8482,
00286         'Uacute'   => 218,
00287         'uacute'   => 250,
00288         'uarr'     => 8593,
00289         'uArr'     => 8657,
00290         'Ucirc'    => 219,
00291         'ucirc'    => 251,
00292         'Ugrave'   => 217,
00293         'ugrave'   => 249,
00294         'uml'      => 168,
00295         'upsih'    => 978,
00296         'Upsilon'  => 933,
00297         'upsilon'  => 965,
00298         'Uuml'     => 220,
00299         'uuml'     => 252,
00300         'weierp'   => 8472,
00301         'Xi'       => 926,
00302         'xi'       => 958,
00303         'Yacute'   => 221,
00304         'yacute'   => 253,
00305         'yen'      => 165,
00306         'Yuml'     => 376,
00307         'yuml'     => 255,
00308         'Zeta'     => 918,
00309         'zeta'     => 950,
00310         'zwj'      => 8205,
00311         'zwnj'     => 8204
00312     );
00313 
00317     private static $htmlEntityAliases = array( // Alternate spellings of entity names => the canonical name used as a key in $htmlEntities
00318         'רלמ' => 'rlm', // Hebrew-script spelling
00319         'رلم' => 'rlm', // Arabic-script spelling
00320     );
00321 
00325     private static $attribsRegex; // Cache for the attribute-matching regex; stays null until getAttribsRegex() builds it
00326 
/**
 * Return the regular expression used to pick individual attributes out of
 * the parameter text of a tag.
 *
 * The pattern is assembled on the first call and then cached in
 * self::$attribsRegex. Each match yields the attribute name in group 1 and,
 * when a value is present, the whole "= value" part in group 2 followed by
 * one group per value form: double-quoted, single-quoted, unquoted, and a
 * bare "#hex" colour.
 *
 * @return string PCRE pattern using the /s and /x modifiers
 */
static function getAttribsRegex() {
    if ( self::$attribsRegex !== null ) {
        // Built by an earlier call; reuse it.
        return self::$attribsRegex;
    }

    $attribFirst = '[:A-Z_a-z0-9]';
    $attrib = '[:A-Z_a-z-.0-9]';
    $space = '[\x09\x0a\x0d\x20]';

    // The continuation lines keep their original leading whitespace so that
    // the string literal stays byte-identical (it is ignored under /x anyway).
    self::$attribsRegex =
        "/(?:^|$space)({$attribFirst}{$attrib}*)
                  ($space*=$space*
                    (?:
                     # The attribute value: quoted or alone
                      \"([^<\"]*)\"
                     | '([^<']*)'
                     |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                     |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                         # colors are specified like this.
                                         # We'll be normalizing it.
                    )
                )?(?=$space|\$)/sx";

    return self::$attribsRegex;
}
00353 
/**
 * Clean up HTML: keep only whitelisted tags (with sanitized attributes) and
 * escape every other "<" / ">" so it is shown as text.
 *
 * HTML comments are stripped first. When $wgUseTidy is off, the function also
 * tracks open tags on a stack so that mismatched close tags are escaped and
 * tags still open at the end of the text are closed. When $wgUseTidy is on,
 * no balancing is attempted here.
 *
 * @param string $text input text
 * @param callable|null $processCallback if callable, invoked as
 *   callback( &$params, $args ) on the attribute text of every allowed
 *   opening tag before validation
 * @param array $args second argument handed to $processCallback
 * @param array $extratags additional tag names to allow (treated as paired tags)
 * @param array $removetags tag names to remove from the allowed set
 * @return string sanitized text
 */
static function removeHTMLtags( $text, $processCallback = null,
    $args = array(), $extratags = array(), $removetags = array()
) {
    global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;

    static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

    wfProfileIn( __METHOD__ );

    // The static tables are keyed on the current global configuration, so that
    // they are rebuilt if the globals change between calls (e.g. in the test suite).
    $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
    if ( !$staticInitialised || $staticInitialised != $globalContext ) {

        $htmlpairsStatic = array( # Tags that must be closed
            'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
            'strike', 'strong', 'tt', 'var', 'div', 'center',
            'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
            'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
            'kbd', 'samp', 'data', 'time', 'mark'
        );
        $htmlsingle = array(
            'br', 'wbr', 'hr', 'li', 'dt', 'dd'
        );
        $htmlsingleonly = array( # Elements that cannot have close tags
            'br', 'wbr', 'hr'
        );
        if ( $wgAllowMicrodataAttributes ) {
            $htmlsingle[] = $htmlsingleonly[] = 'meta';
            $htmlsingle[] = $htmlsingleonly[] = 'link';
        }
        $htmlnest = array( # Tags that can be nested--??
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
            'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
            'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
        );
        $tabletags = array( # Can only appear inside table, we will close them
            'td', 'th', 'tr',
        );
        $htmllist = array( # Tags used by list
            'ul', 'ol',
        );
        $listtags = array( # Tags that can appear in a list
            'li',
        );

        if ( $wgAllowImageTag ) {
            $htmlsingle[] = 'img';
            $htmlsingleonly[] = 'img';
        }

        # Elements whose close tag is optional
        $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
        # Every element allowed by default
        $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

        # Convert them all to hashtables for faster lookup
        $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
            'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
        foreach ( $vars as $var ) {
            $$var = array_flip( $$var );
        }
        $staticInitialised = $globalContext;
    }
    # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
    $extratags = array_flip( $extratags );
    $removetags = array_flip( $removetags );
    $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
    $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments( $text );
    # Every element of $bits after the first begins right after a '<'
    $bits = explode( '<', $text );
    $text = str_replace( '>', '&gt;', array_shift( $bits ) );
    if ( !$wgUseTidy ) {
        # $tagstack: currently open elements; $tablestack: the stacks that were
        # set aside when each enclosing <table> was opened
        $tagstack = $tablestack = array();
        foreach ( $bits as $x ) {
            $regs = array();
            # $slash: Does the current element start with a '/'?
            # $t: Current element name
            # $params: String between element name and >
            # $brace: Ending '>' or '/>'
            # $rest: Everything until the next element of $bits
            if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
                list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
            } else {
                $slash = $t = $params = $brace = $rest = null;
            }

            $badtag = false;
            if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                # Check our stack
                if ( $slash && isset( $htmlsingleonly[$t] ) ) {
                    $badtag = true;
                } elseif ( $slash ) {
                    # Closing a tag... is it the one we just opened?
                    $ot = @array_pop( $tagstack );
                    if ( $ot != $t ) {
                        if ( isset( $htmlsingleallowed[$ot] ) ) {
                            # Pop all elements with an optional close tag
                            # and see if we find a match below them
                            $optstack = array();
                            array_push( $optstack, $ot );
                            wfSuppressWarnings();
                            $ot = array_pop( $tagstack );
                            wfRestoreWarnings();
                            while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
                                array_push( $optstack, $ot );
                                wfSuppressWarnings();
                                $ot = array_pop( $tagstack );
                                wfRestoreWarnings();
                            }
                            if ( $t != $ot ) {
                                # No match. Restore the stack exactly as it was.
                                $badtag = true;
                                # $ot is the first non-optional element found below
                                # the optional ones; it has to go back first,
                                # otherwise it is lost and never gets closed.
                                if ( $ot !== null ) {
                                    array_push( $tagstack, $ot );
                                }
                                # Then push the optional elements back again
                                wfSuppressWarnings();
                                $ot = array_pop( $optstack );
                                wfRestoreWarnings();
                                while ( $ot ) {
                                    array_push( $tagstack, $ot );
                                    wfSuppressWarnings();
                                    $ot = array_pop( $optstack );
                                    wfRestoreWarnings();
                                }
                            } elseif ( $t == 'table' ) {
                                # </table> matched after skipping unclosed cells/rows:
                                # bring back the stack saved when the table was opened,
                                # just as for a directly matched </table>.
                                $tagstack = array_pop( $tablestack );
                            }
                        } else {
                            @array_push( $tagstack, $ot );
                            # <li> can be nested in <ul> or <ol>, skip those cases:
                            if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
                                $badtag = true;
                            }
                        }
                    } else {
                        if ( $t == 'table' ) {
                            $tagstack = array_pop( $tablestack );
                        }
                    }
                    $newparams = '';
                } else {
                    # Keep track for later
                    if ( isset( $tabletags[$t] ) &&
                    !in_array( 'table', $tagstack ) ) {
                        $badtag = true;
                    } elseif ( in_array( $t, $tagstack ) &&
                    !isset( $htmlnest[$t] ) ) {
                        $badtag = true;
                    # Is it a self closed htmlpair ? (bug 5487)
                    } elseif ( $brace == '/>' &&
                    isset( $htmlpairs[$t] ) ) {
                        $badtag = true;
                    } elseif ( isset( $htmlsingleonly[$t] ) ) {
                        # Hack to force empty tag for unclosable elements
                        $brace = '/>';
                    } elseif ( isset( $htmlsingle[$t] ) ) {
                        # Hack to not close $htmlsingle tags
                        $brace = null;
                        # Still need to push this optionally-closed tag to
                        # the tag stack so that we can match end tags
                        # instead of marking them as bad.
                        array_push( $tagstack, $t );
                    } elseif ( isset( $tabletags[$t] )
                    && in_array( $t, $tagstack ) ) {
                        // New table tag but forgot to close the previous one
                        $text .= "</$t>";
                    } else {
                        if ( $t == 'table' ) {
                            # A table starts with a fresh stack; the outer one
                            # is set aside until the table is closed
                            array_push( $tablestack, $tagstack );
                            $tagstack = array();
                        }
                        array_push( $tagstack, $t );
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if ( is_callable( $processCallback ) ) {
                        call_user_func_array( $processCallback, array( &$params, $args ) );
                    }

                    if ( !Sanitizer::validateTag( $params, $t ) ) {
                        $badtag = true;
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes( $params, $t );
                }
                if ( !$badtag ) {
                    $rest = str_replace( '>', '&gt;', $rest );
                    $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
                    $text .= "<$slash$t$newparams$close>$rest";
                    continue;
                }
            }
            # Not an allowed tag (or a rejected one): show it as literal text
            $text .= '&lt;' . str_replace( '>', '&gt;', $x );
        }
        # Close off any remaining tags
        while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
            $text .= "</$t>\n";
            if ( $t == 'table' ) {
                $tagstack = array_pop( $tablestack );
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ( $bits as $x ) {
            preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
            $x, $regs );
            @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
            $badtag = false;
            if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                if ( is_callable( $processCallback ) ) {
                    call_user_func_array( $processCallback, array( &$params, $args ) );
                }

                if ( !Sanitizer::validateTag( $params, $t ) ) {
                    $badtag = true;
                }

                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                if ( !$badtag ) {
                    $rest = str_replace( '>', '&gt;', $rest );
                    $text .= "<$slash$t$newparams$brace$rest";
                    continue;
                }
            }
            $text .= '&lt;' . str_replace( '>', '&gt;', $x );
        }
    }
    wfProfileOut( __METHOD__ );
    return $text;
}
00596 
/**
 * Strip every terminated "<!-- ... -->" comment from the text.
 *
 * When a comment, together with the spaces around it, sits between two
 * newlines, that whole stretch collapses to a single newline. Otherwise only
 * the comment itself is removed. Scanning stops at the first comment that
 * has no closing "-->", which is left in place.
 *
 * @param string $text
 * @return string the text without comments
 */
static function removeHTMLcomments( $text ) {
    wfProfileIn( __METHOD__ );

    for ( ;; ) {
        $open = strpos( $text, '<!--' );
        if ( $open === false ) {
            // No comment left.
            break;
        }
        $close = strpos( $text, '-->', $open + 4 );
        if ( $close === false ) {
            // Unterminated comment: give up and keep the remainder as it is.
            break;
        }
        // Move past the "-->" so that $close is the offset just after the comment.
        $close += 3;

        // Widen the span [$from, $from + $span) over the spaces on both sides,
        // beginning one character before the comment.
        $from = max( $open - 1, 0 );
        $span = $close - $from;
        while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
            --$from;
            ++$span;
        }
        while ( substr( $text, $from + $span, 1 ) === ' ' ) {
            ++$span;
        }

        $aloneOnLine = substr( $text, $from, 1 ) === "\n"
            && substr( $text, $from + $span, 1 ) === "\n";

        // Either swallow the comment with its padding and one of the two
        // newlines, or cut out just the comment.
        $text = $aloneOnLine
            ? substr_replace( $text, "\n", $from, $span + 1 )
            : substr_replace( $text, '', $open, $close - $open );
    }

    wfProfileOut( __METHOD__ );
    return $text;
}
00643 
/**
 * Decide whether a tag is acceptable given the attributes it carries.
 *
 * Only <meta> and <link> are restricted: both need an itemprop attribute,
 * <meta> additionally needs content and <link> additionally needs href.
 * Every other element passes.
 *
 * @param string $params raw attribute text of the tag
 * @param string $element tag name
 * @return bool true if the tag may be kept
 */
static function validateTag( $params, $element ) {
    $attribs = Sanitizer::decodeTagAttributes( $params );

    $isMeta = ( $element == 'meta' );
    $isLink = ( $element == 'link' );
    if ( !$isMeta && !$isLink ) {
        return true;
    }

    // Without itemprop="" neither element is valid or safe in content.
    if ( !isset( $attribs['itemprop'] ) ) {
        return false;
    }
    // <meta> needs a content="" to go with the itemprop.
    if ( $isMeta && !isset( $attribs['content'] ) ) {
        return false;
    }
    // <link> needs an associated href="".
    if ( $isLink && !isset( $attribs['href'] ) ) {
        return false;
    }

    return true;
}
00676 
/**
 * Sanitize an attribute list against the whitelist that
 * attributeWhitelist() returns for the given element.
 *
 * @param array $attribs attribute name => value
 * @param string $element tag name
 * @return array whatever validateAttributes() returns for that whitelist
 */
static function validateTagAttributes( $attribs, $element ) {
    $allowedForElement = Sanitizer::attributeWhitelist( $element );

    return Sanitizer::validateAttributes( $attribs, $allowedForElement );
}
00696 
/**
 * Reduce an attribute list to the entries that are allowed and look safe.
 *
 * An attribute is kept when its name is in $whitelist or starts with
 * "data-" (or, with $wgAllowRdfaAttributes, is an xmlns:* declaration) and
 * its value passes the per-attribute checks below. "style" and "id" values
 * are rewritten through checkCss() and escapeId() respectively.
 *
 * @param array $attribs attribute name => value
 * @param array $whitelist list of permitted attribute names
 * @return array sanitized attribute name => value
 */
static function validateAttributes( $attribs, $whitelist ) {
    global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

    // Attributes whose values may be URLs, URIs and/or CURIs.
    $uriLikeNames = array(
        'rel', 'rev',
        # RDFa
        'about', 'property', 'resource', 'datatype', 'typeof',
        # HTML5 microdata
        'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
    );

    $allowed = array_flip( $whitelist );
    $protocolRegex = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

    $clean = array();
    foreach ( $attribs as $name => $val ) {
        # XML namespace declarations are let through if RDFa is enabled,
        # unless the value looks like a script URI
        if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
            if ( !preg_match( self::EVIL_URI_PATTERN, $val ) ) {
                $clean[$name] = $val;
            }
            continue;
        }

        # Anything that is neither "data-*" nor whitelisted is dropped
        if ( !preg_match( '/^data-/i', $name ) && !isset( $allowed[$name] ) ) {
            continue;
        }

        # Strip javascript "expression" from stylesheets.
        # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        if ( $name == 'style' ) {
            $val = Sanitizer::checkCss( $val );
        }

        if ( $name === 'id' ) {
            $val = Sanitizer::escapeId( $val, 'noninitial' );
        }

        # WAI-ARIA
        # http://www.w3.org/TR/wai-aria/
        # http://www.whatwg.org/html/elements.html#wai-aria
        # Only role="presentation" is supported for now; every other role
        # value causes the attribute to be dropped.
        if ( $name === 'role' && $val !== 'presentation' ) {
            continue;
        }

        # Paranoia: allow "simple" values but suppress javascript
        if ( in_array( $name, $uriLikeNames, true )
            && preg_match( self::EVIL_URI_PATTERN, $val )
        ) {
            continue;
        }

        # NOTE: even though elements using href/src are not allowed directly, supply
        #       validation code that can be used by tag hook handlers, etc.
        # An href or src that does not start with an allowed protocol is
        # dropped; this also drops all relative URLs.
        if ( ( $name === 'href' || $name === 'src' )
            && !preg_match( $protocolRegex, $val )
        ) {
            continue;
        }

        # A later occurrence of the same name overrides an earlier one, so
        # the output has at most one attribute of each name.
        $clean[$name] = $val;
    }

    # itemtype, itemid, itemref don't make sense without itemscope
    # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
    if ( $wgAllowMicrodataAttributes && !array_key_exists( 'itemscope', $clean ) ) {
        unset( $clean['itemtype'], $clean['itemid'], $clean['itemref'] );
    }

    return $clean;
}
00797 
/**
 * Merge two attribute arrays; on a name clash the value from $b wins.
 *
 * The "class" attribute is the exception: when both arrays carry a string
 * class and the two strings differ, the result holds the whitespace-separated
 * class names of both, without duplicates, joined by single spaces.
 *
 * @param array $a attribute name => value
 * @param array $b attribute name => value, taking precedence over $a
 * @return array merged attributes
 */
static function mergeAttributes( $a, $b ) {
    $merged = array_merge( $a, $b );

    if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
        return $merged;
    }

    $first = $a['class'];
    $second = $b['class'];
    if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
        // Non-string or identical class values: the plain merge already is the answer.
        return $merged;
    }

    $names = preg_split( '/\s+/', "{$first} {$second}", -1, PREG_SPLIT_NO_EMPTY );
    $merged['class'] = implode( ' ', array_unique( $names ) );

    return $merged;
}
00820 
00830     public static function normalizeCss( $value ) {
00831 
00832         // Decode character references like &#123;
00833         $value = Sanitizer::decodeCharReferences( $value );
00834 
00835         // Decode escape sequences and line continuation
00836         // See the grammar in the CSS 2 spec, appendix D.
00837         // This has to be done AFTER decoding character references.
00838         // This means it isn't possible for this function to return
00839         // unsanitized escape sequences. It is possible to manufacture
00840         // input that contains character references that decode to
00841         // escape sequences that decode to character references, but
00842         // it's OK for the return value to contain character references
00843         // because the caller is supposed to escape those anyway.
00844         static $decodeRegex;
00845         if ( !$decodeRegex ) {
00846             $space = '[\\x20\\t\\r\\n\\f]';
00847             $nl = '(?:\\n|\\r\\n|\\r|\\f)';
00848             $backslash = '\\\\';
00849             $decodeRegex = "/ $backslash
00850                 (?:
00851                     ($nl) |  # 1. Line continuation
00852                     ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
00853                     (.) | # 3. backslash cancelling special meaning
00854                     () | # 4. backslash at end of string
00855                 )/xu";
00856         }
00857         $value = preg_replace_callback( $decodeRegex,
00858             array( __CLASS__, 'cssDecodeCallback' ), $value );
00859 
00860         // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
00861         $value = preg_replace_callback(
00862             '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
00863             function ( $matches ) {
00864                 $cp = utf8ToCodepoint( $matches[0] );
00865                 if ( $cp === false ) {
00866                     return '';
00867                 }
00868                 return chr( $cp - 65248 ); // ASCII range \x21-\x7A
00869             },
00870             $value
00871         );
00872 
00873         // Convert more characters IE6 might treat as ascii
00874         // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
00875         $value = str_replace(
00876             array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
00877             array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
00878             $value
00879         );
00880 
00881         // Let the value through if it's nothing but a single comment, to
00882         // allow other functions which may reject it to pass some error
00883         // message through.
00884         if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
00885             // Remove any comments; IE gets token splitting wrong
00886             // This must be done AFTER decoding character references and
00887             // escape sequences, because those steps can introduce comments
00888             // This step cannot introduce character references or escape
00889             // sequences, because it replaces comments with spaces rather
00890             // than removing them completely.
00891             $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
00892 
00893             // Remove anything after a comment-start token, to guard against
00894             // incorrect client implementations.
00895             $commentPos = strpos( $value, '/*' );
00896             if ( $commentPos !== false ) {
00897                 $value = substr( $value, 0, $commentPos );
00898             }
00899         }
00900 
00901         // S followed by repeat, iteration, or prolonged sound marks,
00902         // which IE will treat as "ss"
00903         $value = preg_replace(
00904             '/s(?:
00905                 \xE3\x80\xB1 | # U+3031
00906                 \xE3\x82\x9D | # U+309D
00907                 \xE3\x83\xBC | # U+30FC
00908                 \xE3\x83\xBD | # U+30FD
00909                 \xEF\xB9\xBC | # U+FE7C
00910                 \xEF\xB9\xBD | # U+FE7D
00911                 \xEF\xBD\xB0   # U+FF70
00912             )/ix',
00913             'ss',
00914             $value
00915         );
00916 
00917         return $value;
00918     }
00919 
00920 
00939     static function checkCss( $value ) {
00940         $value = self::normalizeCss( $value );
00941 
00942         // Reject problematic keywords and control characters
00943         if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
00944             return '/* invalid control char */';
00945         } elseif ( preg_match(
00946             '! expression
00947                 | filter\s*:
00948                 | accelerator\s*:
00949                 | -o-link\s*:
00950                 | -o-link-source\s*:
00951                 | -o-replace\s*:
00952                 | url\s*\(
00953                 | image\s*\(
00954                 | image-set\s*\(
00955             !ix', $value ) ) {
00956             return '/* insecure input */';
00957         }
00958         return $value;
00959     }
00960 
00965     static function cssDecodeCallback( $matches ) {
00966         if ( $matches[1] !== '' ) {
00967             // Line continuation
00968             return '';
00969         } elseif ( $matches[2] !== '' ) {
00970             $char = codepointToUtf8( hexdec( $matches[2] ) );
00971         } elseif ( $matches[3] !== '' ) {
00972             $char = $matches[3];
00973         } else {
00974             $char = '\\';
00975         }
00976         if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
00977             // These characters need to be escaped in strings
00978             // Clean up the escape sequence to avoid parsing errors by clients
00979             return '\\' . dechex( ord( $char ) ) . ' ';
00980         } else {
00981             // Decode unnecessary escape
00982             return $char;
00983         }
00984     }
00985 
01005     static function fixTagAttributes( $text, $element ) {
01006         if ( trim( $text ) == '' ) {
01007             return '';
01008         }
01009 
01010         $decoded = Sanitizer::decodeTagAttributes( $text );
01011         $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
01012 
01013         return Sanitizer::safeEncodeTagAttributes( $stripped );
01014     }
01015 
01021     static function encodeAttribute( $text ) {
01022         $encValue = htmlspecialchars( $text, ENT_QUOTES );
01023 
01024         // Whitespace is normalized during attribute decoding,
01025         // so if we've been passed non-spaces we must encode them
01026         // ahead of time or they won't be preserved.
01027         $encValue = strtr( $encValue, array(
01028             "\n" => '&#10;',
01029             "\r" => '&#13;',
01030             "\t" => '&#9;',
01031         ) );
01032 
01033         return $encValue;
01034     }
01035 
01042     static function safeEncodeAttribute( $text ) {
01043         $encValue = Sanitizer::encodeAttribute( $text );
01044 
01045         # Templates and links may be expanded in later parsing,
01046         # creating invalid or dangerous output. Suppress this.
01047         $encValue = strtr( $encValue, array(
01048             '<'    => '&lt;',   // This should never happen,
01049             '>'    => '&gt;',   // we've received invalid input
01050             '"'    => '&quot;', // which should have been escaped.
01051             '{'    => '&#123;',
01052             '['    => '&#91;',
01053             "''"   => '&#39;&#39;',
01054             'ISBN' => '&#73;SBN',
01055             'RFC'  => '&#82;FC',
01056             'PMID' => '&#80;MID',
01057             '|'    => '&#124;',
01058             '__'   => '&#95;_',
01059         ) );
01060 
01061         # Stupid hack
01062         $encValue = preg_replace_callback(
01063             '/((?i)' . wfUrlProtocols() . ')/',
01064             array( 'Sanitizer', 'armorLinksCallback' ),
01065             $encValue );
01066         return $encValue;
01067     }
01068 
01100     static function escapeId( $id, $options = array() ) {
01101         global $wgExperimentalHtmlIds;
01102         $options = (array)$options;
01103 
01104         if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
01105             $id = Sanitizer::decodeCharReferences( $id );
01106             $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
01107             $id = trim( $id, '_' );
01108             if ( $id === '' ) {
01109                 # Must have been all whitespace to start with.
01110                 return '_';
01111             } else {
01112                 return $id;
01113             }
01114         }
01115 
01116         # HTML4-style escaping
01117         static $replace = array(
01118             '%3A' => ':',
01119             '%' => '.'
01120         );
01121 
01122         $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
01123         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
01124 
01125         if ( !preg_match( '/^[a-zA-Z]/', $id )
01126         && !in_array( 'noninitial', $options ) ) {
01127             // Initial character must be a letter!
01128             $id = "x$id";
01129         }
01130         return $id;
01131     }
01132 
01144     static function escapeClass( $class ) {
01145         // Convert ugly stuff to underscores and kill underscores in ugly places
01146         return rtrim( preg_replace(
01147             array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ),
01148             '_',
01149             $class ), '_' );
01150     }
01151 
01159     static function escapeHtmlAllowEntities( $html ) {
01160         $html = Sanitizer::decodeCharReferences( $html );
01161         # It seems wise to escape ' as well as ", as a matter of course.  Can't
01162         # hurt.
01163         $html = htmlspecialchars( $html, ENT_QUOTES );
01164         return $html;
01165     }
01166 
01172     private static function armorLinksCallback( $matches ) {
01173         return str_replace( ':', '&#58;', $matches[1] );
01174     }
01175 
01184     public static function decodeTagAttributes( $text ) {
01185         if ( trim( $text ) == '' ) {
01186             return array();
01187         }
01188 
01189         $attribs = array();
01190         $pairs = array();
01191         if ( !preg_match_all(
01192             self::getAttribsRegex(),
01193             $text,
01194             $pairs,
01195             PREG_SET_ORDER ) ) {
01196             return $attribs;
01197         }
01198 
01199         foreach ( $pairs as $set ) {
01200             $attribute = strtolower( $set[1] );
01201             $value = Sanitizer::getTagAttributeCallback( $set );
01202 
01203             // Normalize whitespace
01204             $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
01205             $value = trim( $value );
01206 
01207             // Decode character references
01208             $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
01209         }
01210         return $attribs;
01211     }
01212 
01220     public static function safeEncodeTagAttributes( $assoc_array ) {
01221         $attribs = array();
01222         foreach ( $assoc_array as $attribute => $value ) {
01223             $encAttribute = htmlspecialchars( $attribute );
01224             $encValue = Sanitizer::safeEncodeAttribute( $value );
01225 
01226             $attribs[] = "$encAttribute=\"$encValue\"";
01227         }
01228         return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
01229     }
01230 
01239     private static function getTagAttributeCallback( $set ) {
01240         if ( isset( $set[6] ) ) {
01241             # Illegal #XXXXXX color with no quotes.
01242             return $set[6];
01243         } elseif ( isset( $set[5] ) ) {
01244             # No quotes.
01245             return $set[5];
01246         } elseif ( isset( $set[4] ) ) {
01247             # Single-quoted
01248             return $set[4];
01249         } elseif ( isset( $set[3] ) ) {
01250             # Double-quoted
01251             return $set[3];
01252         } elseif ( !isset( $set[2] ) ) {
01253             # In XHTML, attributes must have a value.
01254             # For 'reduced' form, return explicitly the attribute name here.
01255             return $set[1];
01256         } else {
01257             throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
01258         }
01259     }
01260 
01272     private static function normalizeAttributeValue( $text ) {
01273         return str_replace( '"', '&quot;',
01274             self::normalizeWhitespace(
01275                 Sanitizer::normalizeCharReferences( $text ) ) );
01276     }
01277 
01282     private static function normalizeWhitespace( $text ) {
01283         return preg_replace(
01284             '/\r\n|[\x20\x0d\x0a\x09]/',
01285             ' ',
01286             $text );
01287     }
01288 
01297     static function normalizeSectionNameWhitespace( $section ) {
01298         return trim( preg_replace( '/[ _]+/', ' ', $section ) );
01299     }
01300 
01316     static function normalizeCharReferences( $text ) {
01317         return preg_replace_callback(
01318             self::CHAR_REFS_REGEX,
01319             array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
01320             $text );
01321     }
01326     static function normalizeCharReferencesCallback( $matches ) {
01327         $ret = null;
01328         if ( $matches[1] != '' ) {
01329             $ret = Sanitizer::normalizeEntity( $matches[1] );
01330         } elseif ( $matches[2] != '' ) {
01331             $ret = Sanitizer::decCharReference( $matches[2] );
01332         } elseif ( $matches[3] != '' ) {
01333             $ret = Sanitizer::hexCharReference( $matches[3] );
01334         }
01335         if ( is_null( $ret ) ) {
01336             return htmlspecialchars( $matches[0] );
01337         } else {
01338             return $ret;
01339         }
01340     }
01341 
01352     static function normalizeEntity( $name ) {
01353         if ( isset( self::$htmlEntityAliases[$name] ) ) {
01354             return '&' . self::$htmlEntityAliases[$name] . ';';
01355         } elseif ( in_array( $name,
01356         array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
01357             return "&$name;";
01358         } elseif ( isset( self::$htmlEntities[$name] ) ) {
01359             return '&#' . self::$htmlEntities[$name] . ';';
01360         } else {
01361             return "&amp;$name;";
01362         }
01363     }
01364 
01369     static function decCharReference( $codepoint ) {
01370         $point = intval( $codepoint );
01371         if ( Sanitizer::validateCodepoint( $point ) ) {
01372             return sprintf( '&#%d;', $point );
01373         } else {
01374             return null;
01375         }
01376     }
01377 
01382     static function hexCharReference( $codepoint ) {
01383         $point = hexdec( $codepoint );
01384         if ( Sanitizer::validateCodepoint( $point ) ) {
01385             return sprintf( '&#x%x;', $point );
01386         } else {
01387             return null;
01388         }
01389     }
01390 
01396     private static function validateCodepoint( $codepoint ) {
01397         return $codepoint == 0x09
01398             || $codepoint == 0x0a
01399             || $codepoint == 0x0d
01400             || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
01401             || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
01402             || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
01403     }
01404 
01412     public static function decodeCharReferences( $text ) {
01413         return preg_replace_callback(
01414             self::CHAR_REFS_REGEX,
01415             array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01416             $text );
01417     }
01418 
01429     public static function decodeCharReferencesAndNormalize( $text ) {
01430         global $wgContLang;
01431         $text = preg_replace_callback(
01432             self::CHAR_REFS_REGEX,
01433             array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01434             $text, /* limit */ -1, $count );
01435 
01436         if ( $count ) {
01437             return $wgContLang->normalize( $text );
01438         } else {
01439             return $text;
01440         }
01441     }
01442 
01447     static function decodeCharReferencesCallback( $matches ) {
01448         if ( $matches[1] != '' ) {
01449             return Sanitizer::decodeEntity( $matches[1] );
01450         } elseif ( $matches[2] != '' ) {
01451             return Sanitizer::decodeChar( intval( $matches[2] ) );
01452         } elseif ( $matches[3] != '' ) {
01453             return Sanitizer::decodeChar( hexdec( $matches[3] ) );
01454         }
01455         # Last case should be an ampersand by itself
01456         return $matches[0];
01457     }
01458 
01466     static function decodeChar( $codepoint ) {
01467         if ( Sanitizer::validateCodepoint( $codepoint ) ) {
01468             return codepointToUtf8( $codepoint );
01469         } else {
01470             return UTF8_REPLACEMENT;
01471         }
01472     }
01473 
01482     static function decodeEntity( $name ) {
01483         if ( isset( self::$htmlEntityAliases[$name] ) ) {
01484             $name = self::$htmlEntityAliases[$name];
01485         }
01486         if ( isset( self::$htmlEntities[$name] ) ) {
01487             return codepointToUtf8( self::$htmlEntities[$name] );
01488         } else {
01489             return "&$name;";
01490         }
01491     }
01492 
01499     static function attributeWhitelist( $element ) {
01500         $list = Sanitizer::setupAttributeWhitelist();
01501         return isset( $list[$element] )
01502             ? $list[$element]
01503             : array();
01504     }
01505 
01511     static function setupAttributeWhitelist() {
01512         global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
01513 
01514         static $whitelist, $staticInitialised;
01515         $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
01516 
01517         if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
01518             return $whitelist;
01519         }
01520 
01521         $common = array(
01522             # HTML
01523             'id',
01524             'class',
01525             'style',
01526             'lang',
01527             'dir',
01528             'title',
01529 
01530             # WAI-ARIA
01531             'role',
01532         );
01533 
01534         if ( $wgAllowRdfaAttributes ) {
01535             # RDFa attributes as specified in section 9 of
01536             # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
01537             $common = array_merge( $common, array(
01538                 'about', 'property', 'resource', 'datatype', 'typeof',
01539             ) );
01540         }
01541 
01542         if ( $wgAllowMicrodataAttributes ) {
01543             # add HTML5 microdata tags as specified by
01544             # http://www.whatwg.org/html/microdata.html#the-microdata-model
01545             $common = array_merge( $common, array(
01546                 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
01547             ) );
01548         }
01549 
01550         $block = array_merge( $common, array( 'align' ) );
01551         $tablealign = array( 'align', 'char', 'charoff', 'valign' );
01552         $tablecell = array(
01553             'abbr',
01554             'axis',
01555             'headers',
01556             'scope',
01557             'rowspan',
01558             'colspan',
01559             'nowrap', # deprecated
01560             'width', # deprecated
01561             'height', # deprecated
01562             'bgcolor', # deprecated
01563         );
01564 
01565         # Numbers refer to sections in HTML 4.01 standard describing the element.
01566         # See: http://www.w3.org/TR/html4/
01567         $whitelist = array(
01568             # 7.5.4
01569             'div'        => $block,
01570             'center'     => $common, # deprecated
01571             'span'       => $block, # ??
01572 
01573             # 7.5.5
01574             'h1'         => $block,
01575             'h2'         => $block,
01576             'h3'         => $block,
01577             'h4'         => $block,
01578             'h5'         => $block,
01579             'h6'         => $block,
01580 
01581             # 7.5.6
01582             # address
01583 
01584             # 8.2.4
01585             # bdo
01586 
01587             # 9.2.1
01588             'em'         => $common,
01589             'strong'     => $common,
01590             'cite'       => $common,
01591             'dfn'        => $common,
01592             'code'       => $common,
01593             'samp'       => $common,
01594             'kbd'        => $common,
01595             'var'        => $common,
01596             'abbr'       => $common,
01597             # acronym
01598 
01599             # 9.2.2
01600             'blockquote' => array_merge( $common, array( 'cite' ) ),
01601             # q
01602 
01603             # 9.2.3
01604             'sub'        => $common,
01605             'sup'        => $common,
01606 
01607             # 9.3.1
01608             'p'          => $block,
01609 
01610             # 9.3.2
01611             'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
01612 
01613             # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
01614             'wbr'        => array( 'id', 'class', 'title', 'style' ),
01615 
01616             # 9.3.4
01617             'pre'        => array_merge( $common, array( 'width' ) ),
01618 
01619             # 9.4
01620             'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01621             'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01622 
01623             # 10.2
01624             'ul'         => array_merge( $common, array( 'type' ) ),
01625             'ol'         => array_merge( $common, array( 'type', 'start' ) ),
01626             'li'         => array_merge( $common, array( 'type', 'value' ) ),
01627 
01628             # 10.3
01629             'dl'         => $common,
01630             'dd'         => $common,
01631             'dt'         => $common,
01632 
01633             # 11.2.1
01634             'table'      => array_merge( $common,
01635                                 array( 'summary', 'width', 'border', 'frame',
01636                                         'rules', 'cellspacing', 'cellpadding',
01637                                         'align', 'bgcolor',
01638                                 ) ),
01639 
01640             # 11.2.2
01641             'caption'    => array_merge( $common, array( 'align' ) ),
01642 
01643             # 11.2.3
01644             'thead'      => array_merge( $common, $tablealign ),
01645             'tfoot'      => array_merge( $common, $tablealign ),
01646             'tbody'      => array_merge( $common, $tablealign ),
01647 
01648             # 11.2.4
01649             'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01650             'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01651 
01652             # 11.2.5
01653             'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
01654 
01655             # 11.2.6
01656             'td'         => array_merge( $common, $tablecell, $tablealign ),
01657             'th'         => array_merge( $common, $tablecell, $tablealign ),
01658 
01659             # 12.2
01660             # NOTE: <a> is not allowed directly, but the attrib
01661             # whitelist is used from the Parser object
01662             'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
01663 
01664             # 13.2
01665             # Not usually allowed, but may be used for extension-style hooks
01666             # such as <math> when it is rasterized, or if $wgAllowImageTag is
01667             # true
01668             'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
01669 
01670             # 15.2.1
01671             'tt'         => $common,
01672             'b'          => $common,
01673             'i'          => $common,
01674             'big'        => $common,
01675             'small'      => $common,
01676             'strike'     => $common,
01677             's'          => $common,
01678             'u'          => $common,
01679 
01680             # 15.2.2
01681             'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
01682             # basefont
01683 
01684             # 15.3
01685             'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
01686 
01687             # HTML Ruby annotation text module, simple ruby only.
01688             # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
01689             'ruby'       => $common,
01690             # rbc
01691             # rtc
01692             'rb'         => $common,
01693             'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
01694             'rp'         => $common,
01695 
01696             # MathML root element, where used for extensions
01697             # 'title' may not be 100% valid here; it's XHTML
01698             # http://www.w3.org/TR/REC-MathML/
01699             'math'       => array( 'class', 'style', 'id', 'title' ),
01700 
01701             # HTML 5 section 4.6
01702             'bdi' => $common,
01703 
01704             # HTML5 elements, defined by:
01705             # http://www.whatwg.org/html/
01706             'data' => array_merge( $common, array( 'value' ) ),
01707             'time' => array_merge( $common, array( 'datetime' ) ),
01708             'mark' => $common,
01709 
01710             // meta and link are only permitted by removeHTMLtags when Microdata
01711             // is enabled so we don't bother adding a conditional to hide these
01712             // Also meta and link are only valid in WikiText as Microdata elements
01713             // (ie: validateTag rejects tags missing the attributes needed for Microdata)
01714             // So we don't bother including $common attributes that have no purpose.
01715             'meta' => array( 'itemprop', 'content' ),
01716             'link' => array( 'itemprop', 'href' ),
01717         );
01718 
01719         $staticInitialised = $globalContext;
01720 
01721         return $whitelist;
01722     }
01723 
01734     static function stripAllTags( $text ) {
01735         # Actual <tags>
01736         $text = StringUtils::delimiterReplace( '<', '>', '', $text );
01737 
01738         # Normalize &entities and whitespace
01739         $text = self::decodeCharReferences( $text );
01740         $text = self::normalizeWhitespace( $text );
01741 
01742         return $text;
01743     }
01744 
01754     static function hackDocType() {
01755         $out = "<!DOCTYPE html [\n";
01756         foreach ( self::$htmlEntities as $entity => $codepoint ) {
01757             $out .= "<!ENTITY $entity \"&#$codepoint;\">";
01758         }
01759         $out .= "]>\n";
01760         return $out;
01761     }
01762 
01767     static function cleanUrl( $url ) {
01768         # Normalize any HTML entities in input. They will be
01769         # re-escaped by makeExternalLink().
01770         $url = Sanitizer::decodeCharReferences( $url );
01771 
01772         # Escape any control characters introduced by the above step
01773         $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
01774             array( __CLASS__, 'cleanUrlCallback' ), $url );
01775 
01776         # Validate hostname portion
01777         $matches = array();
01778         if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
01779             list( /* $whole */, $protocol, $host, $rest ) = $matches;
01780 
01781             // Characters that will be ignored in IDNs.
01782             // http://tools.ietf.org/html/3454#section-3.1
01783             // Strip them before further processing so blacklists and such work.
01784             $strip = "/
01785                 \\s|          # general whitespace
01786                 \xc2\xad|     # 00ad SOFT HYPHEN
01787                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
01788                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
01789                 \xe2\x81\xa0| # 2060 WORD JOINER
01790                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
01791                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
01792                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
01793                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
01794                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
01795                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
01796                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
01797                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
01798                 /xuD";
01799 
01800             $host = preg_replace( $strip, '', $host );
01801 
01802             // @todo FIXME: Validate hostnames here
01803 
01804             return $protocol . $host . $rest;
01805         } else {
01806             return $url;
01807         }
01808     }
01809 
01814     static function cleanUrlCallback( $matches ) {
01815         return urlencode( $matches[0] );
01816     }
01817 
01846     public static function validateEmail( $addr ) {
01847         $result = null;
01848         if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
01849             return $result;
01850         }
01851 
01852         // Please note strings below are enclosed in brackets [], this make the
01853         // hyphen "-" a range indicator. Hence it is double backslashed below.
01854         // See bug 26948
01855         $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
01856         $rfc1034_ldh_str = "a-z0-9\\-";
01857 
01858         $HTML5_email_regexp = "/
01859         ^                      # start of string
01860         [$rfc5322_atext\\.]+    # user part which is liberal :p
01861         @                      # 'apostrophe'
01862         [$rfc1034_ldh_str]+       # First domain part
01863         (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
01864         $                      # End of string
01865         /ix"; // case Insensitive, eXtended
01866 
01867         return (bool)preg_match( $HTML5_email_regexp, $addr );
01868     }
01869 }