MediaWiki  REL1_23
Sanitizer.php
Go to the documentation of this file.
00001 <?php
00031 class Sanitizer {
/**
 * Regular expression that finds everything in a string that looks like
 * the start of a character reference. Each match takes one of four
 * alternatives, in this order:
 *   1. a named reference such as &nbsp; (group 1 = the name)
 *   2. a decimal numeric reference such as &#160; (group 2 = the digits)
 *   3. a hexadecimal numeric reference such as &#xA0; (group 3 = the hex digits)
 *   4. a lone ampersand that begins none of the above (group 4)
 * The /x modifier makes the line breaks and indentation inside the
 * pattern insignificant.
 */
00036     const CHAR_REFS_REGEX =
00037         '/&([A-Za-z0-9\x80-\xff]+);
00038          |&\#([0-9]+);
00039          |&\#[xX]([0-9A-Fa-f]+);
00040          |(&)/x';
00041 
/**
 * Case-insensitive pattern for a "javascript" or "vbscript" token that
 * appears at the start of a value, after whitespace, or right after a
 * closing comment marker, and is followed by a non-word character or
 * the end of the value. validateAttributes() drops attribute values
 * that match it.
 */
00050     const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/**
 * Pattern for attribute names of the form xmlns:<name>.
 * validateAttributes() uses it to recognise XML namespace declarations.
 */
00051     const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052 
/**
 * Table of named character entities. Each key is the entity name as it
 * is written between "&" and ";" (names are case-sensitive: 'Aacute'
 * and 'aacute' are different entries) and each value is the decimal
 * code point the entity stands for, e.g. 'amp' => 38.
 */
00058     private static $htmlEntities = array(
00059         'Aacute'   => 193,
00060         'aacute'   => 225,
00061         'Acirc'    => 194,
00062         'acirc'    => 226,
00063         'acute'    => 180,
00064         'AElig'    => 198,
00065         'aelig'    => 230,
00066         'Agrave'   => 192,
00067         'agrave'   => 224,
00068         'alefsym'  => 8501,
00069         'Alpha'    => 913,
00070         'alpha'    => 945,
00071         'amp'      => 38,
00072         'and'      => 8743,
00073         'ang'      => 8736,
00074         'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00075         'Aring'    => 197,
00076         'aring'    => 229,
00077         'asymp'    => 8776,
00078         'Atilde'   => 195,
00079         'atilde'   => 227,
00080         'Auml'     => 196,
00081         'auml'     => 228,
00082         'bdquo'    => 8222,
00083         'Beta'     => 914,
00084         'beta'     => 946,
00085         'brvbar'   => 166,
00086         'bull'     => 8226,
00087         'cap'      => 8745,
00088         'Ccedil'   => 199,
00089         'ccedil'   => 231,
00090         'cedil'    => 184,
00091         'cent'     => 162,
00092         'Chi'      => 935,
00093         'chi'      => 967,
00094         'circ'     => 710,
00095         'clubs'    => 9827,
00096         'cong'     => 8773,
00097         'copy'     => 169,
00098         'crarr'    => 8629,
00099         'cup'      => 8746,
00100         'curren'   => 164,
00101         'dagger'   => 8224,
00102         'Dagger'   => 8225,
00103         'darr'     => 8595,
00104         'dArr'     => 8659,
00105         'deg'      => 176,
00106         'Delta'    => 916,
00107         'delta'    => 948,
00108         'diams'    => 9830,
00109         'divide'   => 247,
00110         'Eacute'   => 201,
00111         'eacute'   => 233,
00112         'Ecirc'    => 202,
00113         'ecirc'    => 234,
00114         'Egrave'   => 200,
00115         'egrave'   => 232,
00116         'empty'    => 8709,
00117         'emsp'     => 8195,
00118         'ensp'     => 8194,
00119         'Epsilon'  => 917,
00120         'epsilon'  => 949,
00121         'equiv'    => 8801,
00122         'Eta'      => 919,
00123         'eta'      => 951,
00124         'ETH'      => 208,
00125         'eth'      => 240,
00126         'Euml'     => 203,
00127         'euml'     => 235,
00128         'euro'     => 8364,
00129         'exist'    => 8707,
00130         'fnof'     => 402,
00131         'forall'   => 8704,
00132         'frac12'   => 189,
00133         'frac14'   => 188,
00134         'frac34'   => 190,
00135         'frasl'    => 8260,
00136         'Gamma'    => 915,
00137         'gamma'    => 947,
00138         'ge'       => 8805,
00139         'gt'       => 62,
00140         'harr'     => 8596,
00141         'hArr'     => 8660,
00142         'hearts'   => 9829,
00143         'hellip'   => 8230,
00144         'Iacute'   => 205,
00145         'iacute'   => 237,
00146         'Icirc'    => 206,
00147         'icirc'    => 238,
00148         'iexcl'    => 161,
00149         'Igrave'   => 204,
00150         'igrave'   => 236,
00151         'image'    => 8465,
00152         'infin'    => 8734,
00153         'int'      => 8747,
00154         'Iota'     => 921,
00155         'iota'     => 953,
00156         'iquest'   => 191,
00157         'isin'     => 8712,
00158         'Iuml'     => 207,
00159         'iuml'     => 239,
00160         'Kappa'    => 922,
00161         'kappa'    => 954,
00162         'Lambda'   => 923,
00163         'lambda'   => 955,
00164         'lang'     => 9001,
00165         'laquo'    => 171,
00166         'larr'     => 8592,
00167         'lArr'     => 8656,
00168         'lceil'    => 8968,
00169         'ldquo'    => 8220,
00170         'le'       => 8804,
00171         'lfloor'   => 8970,
00172         'lowast'   => 8727,
00173         'loz'      => 9674,
00174         'lrm'      => 8206,
00175         'lsaquo'   => 8249,
00176         'lsquo'    => 8216,
00177         'lt'       => 60,
00178         'macr'     => 175,
00179         'mdash'    => 8212,
00180         'micro'    => 181,
00181         'middot'   => 183,
00182         'minus'    => 8722,
00183         'Mu'       => 924,
00184         'mu'       => 956,
00185         'nabla'    => 8711,
00186         'nbsp'     => 160,
00187         'ndash'    => 8211,
00188         'ne'       => 8800,
00189         'ni'       => 8715,
00190         'not'      => 172,
00191         'notin'    => 8713,
00192         'nsub'     => 8836,
00193         'Ntilde'   => 209,
00194         'ntilde'   => 241,
00195         'Nu'       => 925,
00196         'nu'       => 957,
00197         'Oacute'   => 211,
00198         'oacute'   => 243,
00199         'Ocirc'    => 212,
00200         'ocirc'    => 244,
00201         'OElig'    => 338,
00202         'oelig'    => 339,
00203         'Ograve'   => 210,
00204         'ograve'   => 242,
00205         'oline'    => 8254,
00206         'Omega'    => 937,
00207         'omega'    => 969,
00208         'Omicron'  => 927,
00209         'omicron'  => 959,
00210         'oplus'    => 8853,
00211         'or'       => 8744,
00212         'ordf'     => 170,
00213         'ordm'     => 186,
00214         'Oslash'   => 216,
00215         'oslash'   => 248,
00216         'Otilde'   => 213,
00217         'otilde'   => 245,
00218         'otimes'   => 8855,
00219         'Ouml'     => 214,
00220         'ouml'     => 246,
00221         'para'     => 182,
00222         'part'     => 8706,
00223         'permil'   => 8240,
00224         'perp'     => 8869,
00225         'Phi'      => 934,
00226         'phi'      => 966,
00227         'Pi'       => 928,
00228         'pi'       => 960,
00229         'piv'      => 982,
00230         'plusmn'   => 177,
00231         'pound'    => 163,
00232         'prime'    => 8242,
00233         'Prime'    => 8243,
00234         'prod'     => 8719,
00235         'prop'     => 8733,
00236         'Psi'      => 936,
00237         'psi'      => 968,
00238         'quot'     => 34,
00239         'radic'    => 8730,
00240         'rang'     => 9002,
00241         'raquo'    => 187,
00242         'rarr'     => 8594,
00243         'rArr'     => 8658,
00244         'rceil'    => 8969,
00245         'rdquo'    => 8221,
00246         'real'     => 8476,
00247         'reg'      => 174,
00248         'rfloor'   => 8971,
00249         'Rho'      => 929,
00250         'rho'      => 961,
00251         'rlm'      => 8207,
00252         'rsaquo'   => 8250,
00253         'rsquo'    => 8217,
00254         'sbquo'    => 8218,
00255         'Scaron'   => 352,
00256         'scaron'   => 353,
00257         'sdot'     => 8901,
00258         'sect'     => 167,
00259         'shy'      => 173,
00260         'Sigma'    => 931,
00261         'sigma'    => 963,
00262         'sigmaf'   => 962,
00263         'sim'      => 8764,
00264         'spades'   => 9824,
00265         'sub'      => 8834,
00266         'sube'     => 8838,
00267         'sum'      => 8721,
00268         'sup'      => 8835,
00269         'sup1'     => 185,
00270         'sup2'     => 178,
00271         'sup3'     => 179,
00272         'supe'     => 8839,
00273         'szlig'    => 223,
00274         'Tau'      => 932,
00275         'tau'      => 964,
00276         'there4'   => 8756,
00277         'Theta'    => 920,
00278         'theta'    => 952,
00279         'thetasym' => 977,
00280         'thinsp'   => 8201,
00281         'THORN'    => 222,
00282         'thorn'    => 254,
00283         'tilde'    => 732,
00284         'times'    => 215,
00285         'trade'    => 8482,
00286         'Uacute'   => 218,
00287         'uacute'   => 250,
00288         'uarr'     => 8593,
00289         'uArr'     => 8657,
00290         'Ucirc'    => 219,
00291         'ucirc'    => 251,
00292         'Ugrave'   => 217,
00293         'ugrave'   => 249,
00294         'uml'      => 168,
00295         'upsih'    => 978,
00296         'Upsilon'  => 933,
00297         'upsilon'  => 965,
00298         'Uuml'     => 220,
00299         'uuml'     => 252,
00300         'weierp'   => 8472,
00301         'Xi'       => 926,
00302         'xi'       => 958,
00303         'Yacute'   => 221,
00304         'yacute'   => 253,
00305         'yen'      => 165,
00306         'Yuml'     => 376,
00307         'yuml'     => 255,
00308         'Zeta'     => 918,
00309         'zeta'     => 950,
00310         'zwj'      => 8205,
00311         'zwnj'     => 8204
00312     );
00313 
/**
 * Alternative spellings of entity names, keyed by the alias and mapped
 * to the canonical entity name. Both entries here are non-Latin-script
 * spellings that resolve to 'rlm', which is a key of $htmlEntities.
 */
00317     private static $htmlEntityAliases = array(
00318         'רלמ' => 'rlm',
00319         'رلم' => 'rlm',
00320     );
00321 
/**
 * Cache for the pattern built by getAttribsRegex(); stays null until
 * that method is called for the first time.
 */
00325     private static $attribsRegex;
00326 
/**
 * Return the regular expression used to pick individual attributes out
 * of the attribute portion of a tag. The pattern is assembled on the
 * first call and cached in self::$attribsRegex afterwards.
 *
 * Capture groups of the resulting pattern:
 *   1. attribute name
 *   2. the whole "= value" part, absent for a valueless attribute
 *   3. value given in double quotes (quotes excluded)
 *   4. value given in single quotes (quotes excluded)
 *   5. unquoted value
 *   6. unquoted value of the form #hex (see the note inside the pattern)
 * A match must start at the beginning of the string or after whitespace
 * and must be followed by whitespace or the end of the string.
 *
 * @return string PCRE pattern, delimiters and the "sx" modifiers included
 */
00332     static function getAttribsRegex() {
00333         if ( self::$attribsRegex === null ) {
00334             $attribFirst = '[:A-Z_a-z0-9]';
00335             $attrib = '[:A-Z_a-z-.0-9]';
00336             $space = '[\x09\x0a\x0d\x20]';
00337             self::$attribsRegex =
00338                 "/(?:^|$space)({$attribFirst}{$attrib}*)
00339                   ($space*=$space*
00340                     (?:
00341                      # The attribute value: quoted or alone
00342                       \"([^<\"]*)\"
00343                      | '([^<']*)'
00344                      |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00345                      |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00346                                          # colors are specified like this.
00347                                          # We'll be normalizing it.
00348                     )
00349                 )?(?=$space|\$)/sx";
00350         }
00351         return self::$attribsRegex;
00352     }
00353 
/**
 * Clean up HTML in $text so that only whitelisted elements survive as
 * markup; everything else is emitted as text with "<" turned into
 * "&lt;" and ">" into "&gt;". HTML comments are removed first.
 *
 * For every accepted start tag the attribute string is optionally
 * passed through $processCallback, checked with validateTag() and then
 * filtered with fixTagAttributes().
 *
 * When $wgUseTidy is off, a tag stack is also maintained in order to
 * reject mismatched end tags, non-nestable tags opened twice, table
 * cells/rows outside a table and self-closed pair tags, and to append
 * end tags for whatever is still open at the end of the input. When
 * $wgUseTidy is on, none of that nesting bookkeeping is done here
 * (presumably it is left to Tidy -- confirm against the caller).
 *
 * @param string $text Markup to clean
 * @param callable|null $processCallback Invoked as
 *   callback( &$params, $args ) so it can rewrite a tag's attribute string
 * @param array $args Passed through to $processCallback
 * @param array $extratags Additional element names to allow; they are
 *   treated as tags that must be closed
 * @param array $removetags Element names to take off the whitelist
 * @return string
 */
00366     static function removeHTMLtags( $text, $processCallback = null,
00367         $args = array(), $extratags = array(), $removetags = array()
00368     ) {
00369         global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
00370 
00371         static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00372             $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
00373 
00374         wfProfileIn( __METHOD__ );
00375 
00376         // Base our staticInitialised variable off of the global config state so that if the globals
00377         // are changed (like in the screwed up test system) we will re-initialise the settings.
00378         $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
00379         if ( !$staticInitialised || $staticInitialised != $globalContext ) {
00380 
00381             $htmlpairsStatic = array( # Tags that must be closed
00382                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00383                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00384                 'strike', 'strong', 'tt', 'var', 'div', 'center',
00385                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00386                 'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
00387                 'kbd', 'samp', 'data', 'time', 'mark'
00388             );
00389             $htmlsingle = array(
00390                 'br', 'wbr', 'hr', 'li', 'dt', 'dd'
00391             );
00392             $htmlsingleonly = array( # Elements that cannot have close tags
00393                 'br', 'wbr', 'hr'
00394             );
00395             if ( $wgAllowMicrodataAttributes ) {
00396                 $htmlsingle[] = $htmlsingleonly[] = 'meta';
00397                 $htmlsingle[] = $htmlsingleonly[] = 'link';
00398             }
00399             $htmlnest = array( # Tags that can be nested--??
00400                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00401                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
00402                 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
00403             );
00404             $tabletags = array( # Can only appear inside table, we will close them
00405                 'td', 'th', 'tr',
00406             );
00407             $htmllist = array( # Tags used by list
00408                 'ul', 'ol',
00409             );
00410             $listtags = array( # Tags that can appear in a list
00411                 'li',
00412             );
00413 
00414             if ( $wgAllowImageTag ) {
00415                 $htmlsingle[] = 'img';
00416                 $htmlsingleonly[] = 'img';
00417             }
00418 
00419             $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
00420             $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
00421 
00422             # Convert them all to hashtables for faster lookup
00423             $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
00424                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
00425             foreach ( $vars as $var ) {
00426                 $$var = array_flip( $$var );
00427             }
00428             $staticInitialised = $globalContext;
00429         }
00430         # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
00431         $extratags = array_flip( $extratags );
00432         $removetags = array_flip( $removetags );
00433         $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
00434         $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
00435 
00436         # Remove HTML comments
00437         $text = Sanitizer::removeHTMLcomments( $text );
              # Split on '<' so that every later piece starts right after a '<'
              # and can be tested for being a tag.
00438         $bits = explode( '<', $text );
              # The first piece precedes any '<'; only stray '>' needs escaping.
00439         $text = str_replace( '>', '&gt;', array_shift( $bits ) );
00440         if ( !$wgUseTidy ) {
00441             $tagstack = $tablestack = array();
00442             foreach ( $bits as $x ) {
00443                 $regs = array();
00444                 # $slash: Does the current element start with a '/'?
00445                 # $t: Current element name
00446                 # $params: String between element name and >
00447                 # $brace: Ending '>' or '/>'
00448                 # $rest: Everything until the next element of $bits
00449                 if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00450                     list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00451                 } else {
                          # Not tag-shaped: with no element name the whitelist lookup
                          # below normally fails and the piece is escaped as text.
00452                     $slash = $t = $params = $brace = $rest = null;
00453                 }
00454 
00455                 $badtag = false;
00456                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00457                     # Check our stack
00458                     if ( $slash && isset( $htmlsingleonly[$t] ) ) {
00459                         $badtag = true;
00460                     } elseif ( $slash ) {
00461                         # Closing a tag... is it the one we just opened?
00462                         $ot = @array_pop( $tagstack );
00463                         if ( $ot != $t ) {
00464                             if ( isset( $htmlsingleallowed[$ot] ) ) {
00465                                 # Pop all elements with an optional close tag
00466                                 # and see if we find a match below them
00467                                 $optstack = array();
00468                                 array_push( $optstack, $ot );
00469                                 wfSuppressWarnings();
00470                                 $ot = array_pop( $tagstack );
00471                                 wfRestoreWarnings();
00472                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
00473                                     array_push( $optstack, $ot );
00474                                     wfSuppressWarnings();
00475                                     $ot = array_pop( $tagstack );
00476                                     wfRestoreWarnings();
00477                                 }
00478                                 if ( $t != $ot ) {
00479                                     # No match. Push the optional elements back again
00480                                     $badtag = true;
00481                                     wfSuppressWarnings();
00482                                     $ot = array_pop( $optstack );
00483                                     wfRestoreWarnings();
00484                                     while ( $ot ) {
00485                                         array_push( $tagstack, $ot );
00486                                         wfSuppressWarnings();
00487                                         $ot = array_pop( $optstack );
00488                                         wfRestoreWarnings();
00489                                     }
00490                                 }
00491                             } else {
00492                                 @array_push( $tagstack, $ot );
00493                                 # <li> can be nested in <ul> or <ol>, skip those cases:
00494                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
00495                                     $badtag = true;
00496                                 }
00497                             }
00498                         } else {
00499                             if ( $t == 'table' ) {
                                      # Leaving the table: resume the tag stack that was
                                      # saved on $tablestack when the table was opened.
00500                                 $tagstack = array_pop( $tablestack );
00501                             }
00502                         }
00503                         $newparams = '';
00504                     } else {
00505                         # Keep track for later
00506                         if ( isset( $tabletags[$t] ) &&
00507                         !in_array( 'table', $tagstack ) ) {
00508                             $badtag = true;
00509                         } elseif ( in_array( $t, $tagstack ) &&
00510                         !isset( $htmlnest[$t] ) ) {
00511                             $badtag = true;
00512                         # Is it a self closed htmlpair ? (bug 5487)
00513                         } elseif ( $brace == '/>' &&
00514                         isset( $htmlpairs[$t] ) ) {
00515                             $badtag = true;
00516                         } elseif ( isset( $htmlsingleonly[$t] ) ) {
00517                             # Hack to force empty tag for unclosable elements
00518                             $brace = '/>';
00519                         } elseif ( isset( $htmlsingle[$t] ) ) {
00520                             # Hack to not close $htmlsingle tags
00521                             $brace = null;
00522                             # Still need to push this optionally-closed tag to
00523                             # the tag stack so that we can match end tags
00524                             # instead of marking them as bad.
00525                             array_push( $tagstack, $t );
00526                         } elseif ( isset( $tabletags[$t] )
00527                         && in_array( $t, $tagstack ) ) {
00528                             // New table tag but forgot to close the previous one
00529                             $text .= "</$t>";
00530                         } else {
00531                             if ( $t == 'table' ) {
00532                                 array_push( $tablestack, $tagstack );
00533                                 $tagstack = array();
00534                             }
00535                             array_push( $tagstack, $t );
00536                         }
00537 
00538                         # Replace any variables or template parameters with
00539                         # plaintext results.
00540                         if ( is_callable( $processCallback ) ) {
00541                             call_user_func_array( $processCallback, array( &$params, $args ) );
00542                         }
00543 
00544                         if ( !Sanitizer::validateTag( $params, $t ) ) {
00545                             $badtag = true;
00546                         }
00547 
00548                         # Strip non-approved attributes from the tag
00549                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
00550                     }
00551                     if ( !$badtag ) {
00552                         $rest = str_replace( '>', '&gt;', $rest );
00553                         $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00554                         $text .= "<$slash$t$newparams$close>$rest";
00555                         continue;
00556                     }
00557                 }
                      # Not emitted as a tag above: output the piece as escaped text.
00558                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
00559             }
00560             # Close off any remaining tags
00561             while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
00562                 $text .= "</$t>\n";
00563                 if ( $t == 'table' ) {
00564                     $tagstack = array_pop( $tablestack );
00565                 }
00566             }
00567         } else {
00568             # this might be possible using tidy itself
00569             foreach ( $bits as $x ) {
                      # No nesting checks in this branch; only the whitelist,
                      # validateTag() and the attribute filter are applied.
00570                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00571                 $x, $regs );
00572                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00573                 $badtag = false;
00574                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00575                     if ( is_callable( $processCallback ) ) {
00576                         call_user_func_array( $processCallback, array( &$params, $args ) );
00577                     }
00578 
00579                     if ( !Sanitizer::validateTag( $params, $t ) ) {
00580                         $badtag = true;
00581                     }
00582 
00583                     $newparams = Sanitizer::fixTagAttributes( $params, $t );
00584                     if ( !$badtag ) {
00585                         $rest = str_replace( '>', '&gt;', $rest );
00586                         $text .= "<$slash$t$newparams$brace$rest";
00587                         continue;
00588                     }
00589                 }
00590                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
00591             }
00592         }
00593         wfProfileOut( __METHOD__ );
00594         return $text;
00595     }
00596 
/**
 * Strip every complete HTML comment (<!-- ... -->) from the text.
 *
 * A comment that has nothing but spaces between it and a newline on
 * both sides is removed together with those spaces and one of the two
 * newlines, so that no blank line is left where it stood. Any other
 * comment is simply cut out. Scanning stops at the first comment that
 * is never terminated; that comment and what follows stay untouched.
 *
 * @param string $text
 * @return string
 */
static function removeHTMLcomments( $text ) {
    wfProfileIn( __METHOD__ );
    for ( ; ; ) {
        $open = strpos( $text, '<!--' );
        if ( $open === false ) {
            break;
        }
        $terminator = strpos( $text, '-->', $open + 4 );
        if ( $terminator === false ) {
            # Unterminated comment; give up
            break;
        }
        # First offset past the closing "-->"
        $after = $terminator + 3;

        # Move left from the character before the comment across spaces
        $left = max( $open - 1, 0 );
        while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
            $left--;
        }

        # Move right from the end of the comment across spaces
        $right = $after;
        while ( substr( $text, $right, 1 ) === ' ' ) {
            $right++;
        }

        $wrappedInNewlines = substr( $text, $left, 1 ) === "\n"
            && substr( $text, $right, 1 ) === "\n";
        if ( $wrappedInNewlines ) {
            # Swallow the leading newline, the padding, the comment and
            # the trailing newline, and put a single newline back.
            $text = substr_replace( $text, "\n", $left, $right - $left + 1 );
        } else {
            # Only the comment itself goes.
            $text = substr_replace( $text, '', $open, $after - $open );
        }
    }
    wfProfileOut( __METHOD__ );
    return $text;
}
00642 
/**
 * Decide whether a tag may be kept, judging by the attributes it
 * carries. Only <meta> and <link> are restricted; every other element
 * passes.
 *
 * @param string $params Raw attribute string of the tag
 * @param string $element Element name
 * @return bool False when a <meta> or <link> lacks a required attribute
 */
static function validateTag( $params, $element ) {
    $attribs = Sanitizer::decodeTagAttributes( $params );

    $isMeta = ( $element == 'meta' );
    $isLink = ( $element == 'link' );
    if ( !$isMeta && !$isLink ) {
        return true;
    }

    // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
    if ( !isset( $attribs['itemprop'] ) ) {
        return false;
    }
    // <meta> must have a content="" for the itemprop
    if ( $isMeta && !isset( $attribs['content'] ) ) {
        return false;
    }
    // <link> must have an associated href=""
    if ( $isLink && !isset( $attribs['href'] ) ) {
        return false;
    }

    return true;
}
00675 
/**
 * Filter a set of attributes against the whitelist that
 * attributeWhitelist() supplies for the given element.
 *
 * @param array $attribs Attribute name => value
 * @param string $element Element name
 * @return array Result of validateAttributes() for that whitelist
 */
static function validateTagAttributes( $attribs, $element ) {
    $whitelist = Sanitizer::attributeWhitelist( $element );
    return Sanitizer::validateAttributes( $attribs, $whitelist );
}
00695 
/**
 * Filter a set of attributes against a whitelist of attribute names
 * and clean up the values that are kept.
 *
 * - xmlns:* declarations pass when $wgAllowRdfaAttributes is set and
 *   the value does not match EVIL_URI_PATTERN.
 * - Otherwise the name must be whitelisted or start with "data-".
 * - "style" values go through checkCss(), "id" values through escapeId().
 * - "role" is kept only with the value "presentation".
 * - RDFa/microdata attributes are dropped when their value matches
 *   EVIL_URI_PATTERN.
 * - "href" and "src" are dropped unless the value starts with one of
 *   the protocols from wfUrlProtocols().
 * - With $wgAllowMicrodataAttributes set, itemtype, itemid and itemref
 *   are removed when there is no itemscope.
 *
 * @param array $attribs Attribute name => value
 * @param array $whitelist List of permitted attribute names
 * @return array Attribute name => value for the attributes that survive
 */
static function validateAttributes( $attribs, $whitelist ) {
    global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

    $allowed = array_flip( $whitelist );
    $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

    // RDFa and microdata properties allow URLs, URIs and/or CURIs.
    $uriLikeAttribs = array(
        'rel', 'rev',
        # RDFa
        'about', 'property', 'resource', 'datatype', 'typeof',
        # HTML5 microdata
        'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
    );
    # NOTE: even though elements using href/src are not allowed directly, supply
    #       validation code that can be used by tag hook handlers, etc
    $linkAttribs = array( 'href', 'src' );

    $out = array();
    foreach ( $attribs as $attribute => $value ) {
        # Allow XML namespace declaration if RDFa is enabled
        if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
            if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
                $out[$attribute] = $value;
            }
            continue;
        }

        # Allow any attribute beginning with "data-"
        $isDataAttribute = preg_match( '/^data-/i', $attribute );
        if ( !$isDataAttribute && !isset( $allowed[$attribute] ) ) {
            continue;
        }

        # Strip javascript "expression" from stylesheets.
        # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        if ( $attribute == 'style' ) {
            $value = Sanitizer::checkCss( $value );
        }

        if ( $attribute === 'id' ) {
            $value = Sanitizer::escapeId( $value, 'noninitial' );
        }

        # WAI-ARIA
        # http://www.w3.org/TR/wai-aria/
        # http://www.whatwg.org/html/elements.html#wai-aria
        # Only role="presentation" is supported for now.
        if ( $attribute === 'role' && $value !== 'presentation' ) {
            continue;
        }

        // Paranoia. Allow "simple" values but suppress javascript
        if ( in_array( $attribute, $uriLikeAttribs, true )
            && preg_match( self::EVIL_URI_PATTERN, $value )
        ) {
            continue;
        }

        // Drop any href or src attributes not using an allowed protocol.
        // NOTE: this also drops all relative URLs
        if ( in_array( $attribute, $linkAttribs, true )
            && !preg_match( $hrefExp, $value )
        ) {
            continue;
        }

        // A later occurrence of the same name overrides an earlier one,
        // so the output has only one attribute of each name.
        $out[$attribute] = $value;
    }

    # itemtype, itemid, itemref don't make sense without itemscope
    # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
    if ( $wgAllowMicrodataAttributes && !array_key_exists( 'itemscope', $out ) ) {
        unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
    }

    return $out;
}
00796 
/**
 * Merge two attribute sets. Values from $b override values from $a,
 * except for "class": when both sets carry a different string class,
 * the two class lists are combined, with duplicates removed and the
 * names separated by single spaces.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
    $merged = array_merge( $a, $b );

    if ( !isset( $a['class'], $b['class'] ) ) {
        return $merged;
    }

    $first = $a['class'];
    $second = $b['class'];
    if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
        return $merged;
    }

    $names = preg_split( '/\s+/', "{$first} {$second}", -1, PREG_SPLIT_NO_EMPTY );
    $merged['class'] = implode( ' ', array_unique( $names ) );

    return $merged;
}
00819 
/**
 * Normalize a CSS value so that later checks see it the way a lenient
 * client might interpret it.
 *
 * Steps, in order:
 *  - decode HTML character references;
 *  - decode CSS backslash escapes and line continuations;
 *  - fold fullwidth forms U+FF01..U+FF5A (except U+FF3C) to ASCII;
 *  - fold a handful of look-alike letters/parentheses to ASCII;
 *  - unless the whole value is a single comment, replace comments by a
 *    space and cut the value at any remaining comment-start token;
 *  - rewrite "s" followed by a repeat/prolonged-sound mark as "ss".
 *
 * @param string $value
 * @return string
 */
public static function normalizeCss( $value ) {

    // Decode character references like &#123;
    $value = Sanitizer::decodeCharReferences( $value );

    // Decode escape sequences and line continuation
    // See the grammar in the CSS 2 spec, appendix D.
    // This has to be done AFTER decoding character references.
    // This means it isn't possible for this function to return
    // unsanitized escape sequences. It is possible to manufacture
    // input that contains character references that decode to
    // escape sequences that decode to character references, but
    // it's OK for the return value to contain character references
    // because the caller is supposed to escape those anyway.
    static $decodeRegex;
    if ( !$decodeRegex ) {
        $space = '[\\x20\\t\\r\\n\\f]';
        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
        $backslash = '\\\\';
        $decodeRegex = "/ $backslash
                (?:
                    ($nl) |  # 1. Line continuation
                    ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
                    (.) | # 3. backslash cancelling special meaning
                    () | # 4. backslash at end of string
                )/xu";
    }
    $value = preg_replace_callback( $decodeRegex,
        array( __CLASS__, 'cssDecodeCallback' ), $value );

    // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
    // The range is spelled with \x{...} escapes rather than literal fullwidth
    // characters so that it cannot be silently folded to ASCII by tooling
    // that applies compatibility normalization to the source file.
    $value = preg_replace_callback(
        '/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
        function ( $matches ) {
            $cp = utf8ToCodepoint( $matches[0] );
            if ( $cp === false ) {
                return '';
            }
            // 65248 = 0xFEE0, the offset between the fullwidth form and
            // its ASCII counterpart.
            return chr( $cp - 65248 ); // ASCII range \x21-\x7A
        },
        $value
    );

    // Convert more characters IE6 might treat as ascii
    // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
    $value = str_replace(
        array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
        array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
        $value
    );

    // Let the value through if it's nothing but a single comment, to
    // allow other functions which may reject it to pass some error
    // message through.
    if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
        // Remove any comments; IE gets token splitting wrong
        // This must be done AFTER decoding character references and
        // escape sequences, because those steps can introduce comments
        // This step cannot introduce character references or escape
        // sequences, because it replaces comments with spaces rather
        // than removing them completely.
        $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

        // Remove anything after a comment-start token, to guard against
        // incorrect client implementations.
        $commentPos = strpos( $value, '/*' );
        if ( $commentPos !== false ) {
            $value = substr( $value, 0, $commentPos );
        }
    }

    // S followed by repeat, iteration, or prolonged sound marks,
    // which IE will treat as "ss"
    $value = preg_replace(
        '/s(?:
                \xE3\x80\xB1 | # U+3031
                \xE3\x82\x9D | # U+309D
                \xE3\x83\xBC | # U+30FC
                \xE3\x83\xBD | # U+30FD
                \xEF\xB9\xBC | # U+FE7C
                \xEF\xB9\xBD | # U+FE7D
                \xEF\xBD\xB0   # U+FF70
            )/ix',
        'ss',
        $value
    );

    return $value;
}
00918 
00919 
/**
 * Normalize a CSS value and reject it if it contains control characters
 * or one of a fixed list of risky constructs.
 *
 * @param string $value
 * @return string The normalized value, or a CSS comment naming the reason
 *   the input was rejected
 */
static function checkCss( $value ) {
    $css = self::normalizeCss( $value );

    // Control characters, excluding tab, LF, FF and CR
    $controlChars = '/[\000-\010\013\016-\037\177]/';
    if ( preg_match( $controlChars, $css ) ) {
        return '/* invalid control char */';
    }

    // Keywords and functional notations that are refused outright
    $insecure = '! expression
                | filter\s*:
                | accelerator\s*:
                | -o-link\s*:
                | -o-link-source\s*:
                | -o-replace\s*:
                | url\s*\(
                | image\s*\(
                | image-set\s*\(
            !ix';
    if ( preg_match( $insecure, $css ) ) {
        return '/* insecure input */';
    }

    return $css;
}
00959 
/**
 * preg_replace_callback() handler that decodes one CSS backslash escape.
 *
 * Capture groups: 1 = line continuation, 2 = hex character number,
 * 3 = single escaped character; none of them set = lone trailing backslash.
 *
 * @param array $matches
 * @return string Replacement text for the matched escape
 */
static function cssDecodeCallback( $matches ) {
    if ( $matches[1] !== '' ) {
        // A line continuation simply vanishes
        return '';
    }

    if ( $matches[2] !== '' ) {
        $decoded = codepointToUtf8( hexdec( $matches[2] ) );
    } elseif ( $matches[3] !== '' ) {
        $decoded = $matches[3];
    } else {
        $decoded = '\\';
    }

    // Newline, quotes and backslash have to stay escaped inside strings;
    // emit them in a canonical hex form so clients parse them uniformly.
    $staysEscaped = ( $decoded == "\n" || $decoded == '"' || $decoded == "'" || $decoded == '\\' );

    return $staysEscaped
        ? '\\' . dechex( ord( $decoded ) ) . ' '
        : $decoded;
}
00984 
/**
 * Decode, validate and re-encode the attribute text of a tag.
 *
 * @param string $text Raw attribute text
 * @param string $element Tag name, passed on to validateTagAttributes()
 * @return string Encoded attribute string, or '' when $text is blank
 */
static function fixTagAttributes( $text, $element ) {
    if ( trim( $text ) == '' ) {
        return '';
    }

    return Sanitizer::safeEncodeTagAttributes(
        Sanitizer::validateTagAttributes(
            Sanitizer::decodeTagAttributes( $text ),
            $element
        )
    );
}
01014 
/**
 * Encode a string for use as an HTML attribute value.
 *
 * @param string $text
 * @return string
 */
static function encodeAttribute( $text ) {
    $escaped = htmlspecialchars( $text, ENT_QUOTES );

    // Attribute decoding collapses whitespace, so newline, carriage return
    // and tab are turned into numeric references to survive a round trip.
    return str_replace(
        array( "\n", "\r", "\t" ),
        array( '&#10;', '&#13;', '&#9;' ),
        $escaped
    );
}
01034 
/**
 * Encode an attribute value and additionally neutralise character
 * sequences that later parsing could expand (templates, links, magic
 * words, URL protocols).
 *
 * @param string $text
 * @return string
 */
static function safeEncodeAttribute( $text ) {
    # Templates and links may be expanded in later parsing,
    # creating invalid or dangerous output. Suppress this.
    $armor = array(
        '<'    => '&lt;',   // This should never happen,
        '>'    => '&gt;',   // we've received invalid input
        '"'    => '&quot;', // which should have been escaped.
        '{'    => '&#123;',
        '['    => '&#91;',
        "''"   => '&#39;&#39;',
        'ISBN' => '&#73;SBN',
        'RFC'  => '&#82;FC',
        'PMID' => '&#80;MID',
        '|'    => '&#124;',
        '__'   => '&#95;_',
    );
    $encoded = strtr( Sanitizer::encodeAttribute( $text ), $armor );

    # Stupid hack: break up anything that looks like a URL protocol
    $protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';

    return preg_replace_callback(
        $protocolPattern,
        array( 'Sanitizer', 'armorLinksCallback' ),
        $encoded
    );
}
01067 
/**
 * Turn an arbitrary string into something usable as an id attribute.
 *
 * Two schemes exist: when $wgExperimentalHtmlIds is set and 'legacy' is
 * not requested, only whitespace and a few punctuation characters are
 * replaced by underscores; otherwise the id is URL-encoded with '%'
 * rewritten to '.' and, unless 'noninitial' is given, forced to start
 * with a letter.
 *
 * @param string $id
 * @param string|array $options 'legacy' and/or 'noninitial'
 * @return string
 */
static function escapeId( $id, $options = array() ) {
    global $wgExperimentalHtmlIds;
    $options = (array)$options;

    $legacyMode = !$wgExperimentalHtmlIds || in_array( 'legacy', $options );

    if ( !$legacyMode ) {
        $id = Sanitizer::decodeCharReferences( $id );
        $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
        $id = trim( $id, '_' );

        # An empty result means there was nothing but separators to start with.
        return $id === '' ? '_' : $id;
    }

    # HTML4-style escaping
    $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
    $id = str_replace( array( '%3A', '%' ), array( ':', '.' ), $id );

    $startsWithLetter = preg_match( '/^[a-zA-Z]/', $id );
    if ( !$startsWithLetter && !in_array( 'noninitial', $options ) ) {
        // Initial character must be a letter!
        $id = "x$id";
    }

    return $id;
}
01131 
/**
 * Turn a string into something usable as a class name.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
    // A leading digit or hyphen, control characters, space, ASCII
    // punctuation and the byte pair C2 A0 all become underscores...
    $uglyChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
    $escaped = preg_replace( $uglyChars, '_', $class );

    // ...then runs of underscores collapse to one...
    $escaped = preg_replace( '/_+/', '_', $escaped );

    // ...and trailing underscores are dropped.
    return rtrim( $escaped, '_' );
}
01150 
/**
 * Escape a string for HTML output after first decoding any character
 * references it already contains.
 *
 * @param string $html
 * @return string
 */
static function escapeHtmlAllowEntities( $html ) {
    $decoded = Sanitizer::decodeCharReferences( $html );

    # Both quote styles are escaped as a matter of course.
    return htmlspecialchars( $decoded, ENT_QUOTES );
}
01165 
/**
 * preg_replace_callback() handler: replace every colon in the first
 * capture group with its numeric character reference.
 *
 * @param array $matches
 * @return string
 */
private static function armorLinksCallback( $matches ) {
    return strtr( $matches[1], array( ':' => '&#58;' ) );
}
01174 
/**
 * Parse raw attribute text into an associative array.
 *
 * Names are lower-cased; values have their whitespace collapsed and
 * trimmed and their character references decoded.
 *
 * @param string $text
 * @return array Map of attribute name => decoded value
 */
public static function decodeTagAttributes( $text ) {
    $decoded = array();

    if ( trim( $text ) == '' ) {
        return $decoded;
    }

    $sets = array();
    $found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
    if ( !$found ) {
        return $decoded;
    }

    foreach ( $sets as $set ) {
        $name = strtolower( $set[1] );
        $raw = Sanitizer::getTagAttributeCallback( $set );

        // Normalize whitespace
        $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

        // Decode character references
        $decoded[$name] = Sanitizer::decodeCharReferences( $collapsed );
    }

    return $decoded;
}
01211 
/**
 * Serialise an attribute map as name="value" pairs.
 *
 * @param array $assoc_array Map of attribute name => value
 * @return string Pairs separated by spaces, with a leading space; '' for
 *   an empty map
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
    $html = '';

    foreach ( $assoc_array as $name => $rawValue ) {
        $html .= ' ' . htmlspecialchars( $name )
            . '="' . Sanitizer::safeEncodeAttribute( $rawValue ) . '"';
    }

    return $html;
}
01229 
/**
 * Pick the attribute value out of one match set of the attribute regex.
 *
 * @param array $set One PREG_SET_ORDER match
 * @return string
 * @throws MWException When a value group is missing although the
 *   value part (group 2) matched
 */
private static function getTagAttributeCallback( $set ) {
    // Highest group wins:
    //   6 = illegal #XXXXXX color with no quotes
    //   5 = no quotes
    //   4 = single-quoted
    //   3 = double-quoted
    foreach ( array( 6, 5, 4, 3 ) as $group ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( !isset( $set[2] ) ) {
        # In XHTML, attributes must have a value.
        # For 'reduced' form, return explicitly the attribute name here.
        return $set[1];
    }

    throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
01259 
/**
 * Normalize character references and whitespace in an attribute value,
 * then escape double quotes.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue( $text ) {
    $withRefs = Sanitizer::normalizeCharReferences( $text );
    $spaced = self::normalizeWhitespace( $withRefs );

    return str_replace( '"', '&quot;', $spaced );
}
01277 
/**
 * Replace each CRLF pair, space, CR, LF or tab with a single space.
 * Consecutive whitespace is not collapsed.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

    return preg_replace( $whitespace, ' ', $text );
}
01288 
/**
 * Collapse runs of spaces and underscores in a section name into one
 * space and trim the result.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
    $collapsed = preg_replace( '/[ _]+/', ' ', $section );

    return trim( $collapsed );
}
01300 
/**
 * Run every character reference and bare ampersand in $text through
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
    $handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

    return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
01322 
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Capture groups: 1 = named entity, 2 = decimal number, 3 = hex number.
 * Anything that cannot be normalized is HTML-escaped as literal text.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        $normalized = Sanitizer::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        $normalized = Sanitizer::decCharReference( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $normalized = Sanitizer::hexCharReference( $matches[3] );
    } else {
        $normalized = null;
    }

    return $normalized === null
        ? htmlspecialchars( $matches[0] )
        : $normalized;
}
01342 
/**
 * Normalize a named entity.
 *
 * Aliases are replaced by their target name, the four entities lt, gt,
 * amp and quot are kept by name, other known entities become decimal
 * references, and unknown names have their ampersand escaped.
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function normalizeEntity( $name ) {
    if ( isset( self::$htmlEntityAliases[$name] ) ) {
        return '&' . self::$htmlEntityAliases[$name] . ';';
    }

    if ( in_array( $name, array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
        return "&$name;";
    }

    if ( isset( self::$htmlEntities[$name] ) ) {
        return '&#' . self::$htmlEntities[$name] . ';';
    }

    return "&amp;$name;";
}
01365 
/**
 * Re-emit a decimal character reference if its codepoint is valid.
 *
 * @param string $codepoint Decimal digits
 * @return string|null Canonical "&#N;" reference, or null when invalid
 */
static function decCharReference( $codepoint ) {
    $point = intval( $codepoint );

    return Sanitizer::validateCodepoint( $point )
        ? sprintf( '&#%d;', $point )
        : null;
}
01378 
/**
 * Re-emit a hexadecimal character reference if its codepoint is valid.
 *
 * @param string $codepoint Hex digits
 * @return string|null Canonical lower-case "&#xN;" reference, or null
 *   when invalid
 */
static function hexCharReference( $codepoint ) {
    $point = hexdec( $codepoint );

    return Sanitizer::validateCodepoint( $point )
        ? sprintf( '&#x%x;', $point )
        : null;
}
01391 
/**
 * Tell whether a codepoint is accepted: tab, LF, CR, or anything in
 * 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    $isAllowedControl = $codepoint == 0x09
        || $codepoint == 0x0a
        || $codepoint == 0x0d;
    $belowSurrogates = $codepoint >= 0x20 && $codepoint <= 0xd7ff;
    $aboveSurrogates = $codepoint >= 0xe000 && $codepoint <= 0xfffd;
    $supplementary = $codepoint >= 0x10000 && $codepoint <= 0x10ffff;

    return $isAllowedControl || $belowSurrogates || $aboveSurrogates || $supplementary;
}
01405 
/**
 * Run every character reference in $text through
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    $handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );

    return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
01419 
/**
 * Decode character references and, if the pattern matched at least
 * once, pass the result through $wgContLang->normalize().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferencesAndNormalize( $text ) {
    global $wgContLang;

    $handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
    $decoded = preg_replace_callback(
        self::CHAR_REFS_REGEX, $handler, $text, /* limit */ -1, $replacements );

    // Text without any match is returned untouched by the normalizer
    return $replacements ? $wgContLang->normalize( $decoded ) : $decoded;
}
01443 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * Capture groups: 1 = named entity, 2 = decimal number, 3 = hex number.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return Sanitizer::decodeEntity( $matches[1] );
    }

    if ( $matches[2] != '' ) {
        return Sanitizer::decodeChar( intval( $matches[2] ) );
    }

    if ( $matches[3] != '' ) {
        return Sanitizer::decodeChar( hexdec( $matches[3] ) );
    }

    # Last case should be an ampersand by itself
    return $matches[0];
}
01459 
/**
 * Convert a codepoint to its UTF-8 encoding, substituting
 * UTF8_REPLACEMENT for codepoints that fail validation.
 *
 * @param int $codepoint
 * @return string
 */
static function decodeChar( $codepoint ) {
    return Sanitizer::validateCodepoint( $codepoint )
        ? codepointToUtf8( $codepoint )
        : UTF8_REPLACEMENT;
}
01474 
/**
 * Decode a named entity to UTF-8. Aliases are resolved first; unknown
 * names are returned in "&name;" form.
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
    if ( isset( self::$htmlEntityAliases[$name] ) ) {
        $name = self::$htmlEntityAliases[$name];
    }

    if ( !isset( self::$htmlEntities[$name] ) ) {
        return "&$name;";
    }

    return codepointToUtf8( self::$htmlEntities[$name] );
}
01493 
/**
 * Fetch the list of attributes allowed on one element.
 *
 * @param string $element
 * @return array Attribute names; empty for an element not in the whitelist
 */
static function attributeWhitelist( $element ) {
    $whitelist = Sanitizer::setupAttributeWhitelist();

    if ( isset( $whitelist[$element] ) ) {
        return $whitelist[$element];
    }

    return array();
}
01506 
/**
 * Build the map of element name => allowed attribute names.
 *
 * The result is cached in a static variable and rebuilt whenever the
 * values of $wgAllowRdfaAttributes / $wgAllowMicrodataAttributes differ
 * from those seen when the cache was filled.
 *
 * @return array
 */
static function setupAttributeWhitelist() {
    global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;

    static $whitelist, $staticInitialised;
    $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );

    if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
        return $whitelist;
    }

    # Attributes allowed on (nearly) every element: HTML core + WAI-ARIA role
    $common = array( 'id', 'class', 'style', 'lang', 'dir', 'title', 'role' );

    if ( $wgAllowRdfaAttributes ) {
        # RDFa attributes as specified in section 9 of
        # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
        $common = array_merge( $common, array(
            'about', 'property', 'resource', 'datatype', 'typeof',
        ) );
    }

    if ( $wgAllowMicrodataAttributes ) {
        # add HTML5 microdata tags as specified by
        # http://www.whatwg.org/html/microdata.html#the-microdata-model
        $common = array_merge( $common, array(
            'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
        ) );
    }

    # Shared attribute sets
    $block = array_merge( $common, array( 'align' ) );
    $tablealign = array( 'align', 'valign' );
    $tablecell = array(
        'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
        'nowrap', 'width', 'height', 'bgcolor', # last four are deprecated
    );
    $citing = array_merge( $common, array( 'cite' ) );
    $edited = array_merge( $common, array( 'cite', 'datetime' ) );
    $spanning = array_merge( $common, array( 'span' ) );
    $sized = array_merge( $common, array( 'width' ) );
    $cell = array_merge( $common, $tablecell, $tablealign );

    # Numbers refer to sections in HTML 4.01 standard describing the element.
    # See: http://www.w3.org/TR/html4/
    # Entries are added in a fixed order so that the key order of the
    # resulting array is stable.
    $map = array();

    # 7.5.4
    $map['div'] = $block;
    $map['center'] = $common; # deprecated
    $map['span'] = $common;

    # 7.5.5
    foreach ( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $tag ) {
        $map[$tag] = $block;
    }

    # 7.5.6 (address) is not whitelisted

    # 8.2.4
    $map['bdo'] = $common;

    # 9.2.1 (acronym is not whitelisted)
    foreach ( array( 'em', 'strong', 'cite', 'dfn', 'code', 'samp', 'kbd', 'var', 'abbr' ) as $tag ) {
        $map[$tag] = $common;
    }

    # 9.2.2
    $map['blockquote'] = $citing;
    $map['q'] = $citing;

    # 9.2.3
    $map['sub'] = $common;
    $map['sup'] = $common;

    # 9.3.1
    $map['p'] = $block;

    # 9.3.2
    $map['br'] = array_merge( $common, array( 'clear' ) );

    # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
    $map['wbr'] = $common;

    # 9.3.4
    $map['pre'] = $sized;

    # 9.4
    $map['ins'] = $edited;
    $map['del'] = $edited;

    # 10.2
    $map['ul'] = array_merge( $common, array( 'type' ) );
    $map['ol'] = array_merge( $common, array( 'type', 'start' ) );
    $map['li'] = array_merge( $common, array( 'type', 'value' ) );

    # 10.3
    $map['dl'] = $common;
    $map['dd'] = $common;
    $map['dt'] = $common;

    # 11.2.1
    $map['table'] = array_merge( $common, array(
        'summary', 'width', 'border', 'frame',
        'rules', 'cellspacing', 'cellpadding',
        'align', 'bgcolor',
    ) );

    # 11.2.2
    $map['caption'] = $block;

    # 11.2.3
    $map['thead'] = $common;
    $map['tfoot'] = $common;
    $map['tbody'] = $common;

    # 11.2.4
    $map['colgroup'] = $spanning;
    $map['col'] = $spanning;

    # 11.2.5
    $map['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

    # 11.2.6
    $map['td'] = $cell;
    $map['th'] = $cell;

    # 12.2
    # NOTE: <a> is not allowed directly, but the attrib
    # whitelist is used from the Parser object
    $map['a'] = array_merge( $common, array( 'href', 'rel', 'rev' ) ); # rel/rev esp. for RDFa

    # 13.2
    # Not usually allowed, but may be used for extension-style hooks
    # such as <math> when it is rasterized, or if $wgAllowImageTag is
    # true
    $map['img'] = array_merge( $common, array( 'alt', 'src', 'width', 'height' ) );

    # 15.2.1
    foreach ( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $tag ) {
        $map[$tag] = $common;
    }

    # 15.2.2 (basefont is not whitelisted)
    $map['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

    # 15.3
    $map['hr'] = $sized;

    # HTML Ruby annotation text module, simple ruby only (no rbc, rtc;
    # rt does not get 'rbspan').
    # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
    foreach ( array( 'ruby', 'rb', 'rt', 'rp' ) as $tag ) {
        $map[$tag] = $common;
    }

    # MathML root element, where used for extensions
    # 'title' may not be 100% valid here; it's XHTML
    # http://www.w3.org/TR/REC-MathML/
    $map['math'] = array( 'class', 'style', 'id', 'title' );

    # HTML 5 section 4.6
    $map['bdi'] = $common;

    # HTML5 elements, defined by:
    # http://www.whatwg.org/html/
    $map['data'] = array_merge( $common, array( 'value' ) );
    $map['time'] = array_merge( $common, array( 'datetime' ) );
    $map['mark'] = $common;

    // meta and link are only permitted by removeHTMLtags when Microdata
    // is enabled so we don't bother adding a conditional to hide these
    // Also meta and link are only valid in WikiText as Microdata elements
    // (ie: validateTag rejects tags missing the attributes needed for Microdata)
    // So we don't bother including $common attributes that have no purpose.
    $map['meta'] = array( 'itemprop', 'content' );
    $map['link'] = array( 'itemprop', 'href' );

    $whitelist = $map;
    $staticInitialised = $globalContext;

    return $whitelist;
}
01724 
/**
 * Remove everything between '<' and '>' from the text, then decode
 * character references and normalize whitespace.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
    # Actual <tags>
    $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

    # Normalize &entities and whitespace
    return self::normalizeWhitespace(
        self::decodeCharReferences( $withoutTags )
    );
}
01745 
/**
 * Build a DOCTYPE declaration whose internal subset declares every
 * entity listed in self::$htmlEntities as a numeric reference.
 *
 * @return string
 */
static function hackDocType() {
    $declarations = array();
    foreach ( self::$htmlEntities as $entity => $codepoint ) {
        $declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
    }

    return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01763 
/**
 * Clean up a URL: decode character references, percent-encode a set of
 * unsafe characters, and strip ignorable characters from the host part.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    $url = Sanitizer::decodeCharReferences( $url );

    # Escape any control characters introduced by the above step
    $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
        array( __CLASS__, 'cleanUrlCallback' ), $url );

    # Split into protocol, optional //host, and the rest
    $parts = array();
    if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
        # Nothing that looks like "protocol:"; hand the URL back as it is
        return $url;
    }

    $protocol = $parts[1];
    $host = $parts[2];
    $rest = $parts[3];

    // Characters that will be ignored in IDNs.
    // http://tools.ietf.org/html/3454#section-3.1
    // Strip them before further processing so blacklists and such work.
    $strip = "/
                \\s|          # general whitespace
                \xc2\xad|     # 00ad SOFT HYPHEN
                \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                \xe2\x81\xa0| # 2060 WORD JOINER
                \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
                /xuD";

    $cleanHost = preg_replace( $strip, '', $host );

    // @todo FIXME: Validate hostnames here

    return $protocol . $cleanHost . $rest;
}
01810 
/**
 * preg_replace_callback() handler: URL-encode the whole match.
 *
 * @param array $matches
 * @return string
 */
static function cleanUrlCallback( $matches ) {
    $unsafe = $matches[0];

    return urlencode( $unsafe );
}
01818 
/**
 * Check whether a string looks like a valid e-mail address.
 *
 * The 'isValidEmailAddr' hook is run first; when wfRunHooks() returns a
 * false value, whatever the hook stored in $result is returned as is.
 * Otherwise the address must consist of a liberal user part, an '@', and
 * one or more dot-separated domain labels made of letters, digits and
 * hyphens.
 *
 * @param string $addr
 * @return bool|null Result of the pattern match, or the hook's $result
 */
public static function validateEmail( $addr ) {
    $result = null;
    if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
        return $result;
    }

    // Please note strings below are enclosed in brackets [], this make the
    // hyphen "-" a range indicator. Hence it is double backslashed below.
    // See bug 26948
    $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
    $rfc1034_ldh_str = "a-z0-9\\-";

    // The D modifier makes "$" match only at the very end of the subject.
    // Without it "$" also matches just before a final newline, so an
    // address followed by "\n" would be accepted.
    $html5_email_regexp = "/
        ^                      # start of string
        [$rfc5322_atext\\.]+    # user part which is liberal :p
        @                      # 'apostrophe'
        [$rfc1034_ldh_str]+       # First domain part
        (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
        $                      # End of string
        /ixD"; // case Insensitive, eXtended, Dollar end only

    return (bool)preg_match( $html5_email_regexp, $addr );
}
01870 }