php/html/Sanitizer_8php_source.html

00001 <?php
00031 class Sanitizer {
00036     const CHAR_REFS_REGEX =
00037         '/&([A-Za-z0-9\x80-\xff]+);
00038          |&\#([0-9]+);
00039          |&\#[xX]([0-9A-Fa-f]+);
00040          |(&)/x';
00041
00050     const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
00051     const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052
00058     private static $htmlEntities = array(
00059         'Aacute'   => 193,
00060         'aacute'   => 225,
00061         'Acirc'    => 194,
00062         'acirc'    => 226,
00063         'acute'    => 180,
00064         'AElig'    => 198,
00065         'aelig'    => 230,
00066         'Agrave'   => 192,
00067         'agrave'   => 224,
00068         'alefsym'  => 8501,
00069         'Alpha'    => 913,
00070         'alpha'    => 945,
00071         'amp'      => 38,
00072         'and'      => 8743,
00073         'ang'      => 8736,
00074         'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00075         'Aring'    => 197,
00076         'aring'    => 229,
00077         'asymp'    => 8776,
00078         'Atilde'   => 195,
00079         'atilde'   => 227,
00080         'Auml'     => 196,
00081         'auml'     => 228,
00082         'bdquo'    => 8222,
00083         'Beta'     => 914,
00084         'beta'     => 946,
00085         'brvbar'   => 166,
00086         'bull'     => 8226,
00087         'cap'      => 8745,
00088         'Ccedil'   => 199,
00089         'ccedil'   => 231,
00090         'cedil'    => 184,
00091         'cent'     => 162,
00092         'Chi'      => 935,
00093         'chi'      => 967,
00094         'circ'     => 710,
00095         'clubs'    => 9827,
00096         'cong'     => 8773,
00097         'copy'     => 169,
00098         'crarr'    => 8629,
00099         'cup'      => 8746,
00100         'curren'   => 164,
00101         'dagger'   => 8224,
00102         'Dagger'   => 8225,
00103         'darr'     => 8595,
00104         'dArr'     => 8659,
00105         'deg'      => 176,
00106         'Delta'    => 916,
00107         'delta'    => 948,
00108         'diams'    => 9830,
00109         'divide'   => 247,
00110         'Eacute'   => 201,
00111         'eacute'   => 233,
00112         'Ecirc'    => 202,
00113         'ecirc'    => 234,
00114         'Egrave'   => 200,
00115         'egrave'   => 232,
00116         'empty'    => 8709,
00117         'emsp'     => 8195,
00118         'ensp'     => 8194,
00119         'Epsilon'  => 917,
00120         'epsilon'  => 949,
00121         'equiv'    => 8801,
00122         'Eta'      => 919,
00123         'eta'      => 951,
00124         'ETH'      => 208,
00125         'eth'      => 240,
00126         'Euml'     => 203,
00127         'euml'     => 235,
00128         'euro'     => 8364,
00129         'exist'    => 8707,
00130         'fnof'     => 402,
00131         'forall'   => 8704,
00132         'frac12'   => 189,
00133         'frac14'   => 188,
00134         'frac34'   => 190,
00135         'frasl'    => 8260,
00136         'Gamma'    => 915,
00137         'gamma'    => 947,
00138         'ge'       => 8805,
00139         'gt'       => 62,
00140         'harr'     => 8596,
00141         'hArr'     => 8660,
00142         'hearts'   => 9829,
00143         'hellip'   => 8230,
00144         'Iacute'   => 205,
00145         'iacute'   => 237,
00146         'Icirc'    => 206,
00147         'icirc'    => 238,
00148         'iexcl'    => 161,
00149         'Igrave'   => 204,
00150         'igrave'   => 236,
00151         'image'    => 8465,
00152         'infin'    => 8734,
00153         'int'      => 8747,
00154         'Iota'     => 921,
00155         'iota'     => 953,
00156         'iquest'   => 191,
00157         'isin'     => 8712,
00158         'Iuml'     => 207,
00159         'iuml'     => 239,
00160         'Kappa'    => 922,
00161         'kappa'    => 954,
00162         'Lambda'   => 923,
00163         'lambda'   => 955,
00164         'lang'     => 9001,
00165         'laquo'    => 171,
00166         'larr'     => 8592,
00167         'lArr'     => 8656,
00168         'lceil'    => 8968,
00169         'ldquo'    => 8220,
00170         'le'       => 8804,
00171         'lfloor'   => 8970,
00172         'lowast'   => 8727,
00173         'loz'      => 9674,
00174         'lrm'      => 8206,
00175         'lsaquo'   => 8249,
00176         'lsquo'    => 8216,
00177         'lt'       => 60,
00178         'macr'     => 175,
00179         'mdash'    => 8212,
00180         'micro'    => 181,
00181         'middot'   => 183,
00182         'minus'    => 8722,
00183         'Mu'       => 924,
00184         'mu'       => 956,
00185         'nabla'    => 8711,
00186         'nbsp'     => 160,
00187         'ndash'    => 8211,
00188         'ne'       => 8800,
00189         'ni'       => 8715,
00190         'not'      => 172,
00191         'notin'    => 8713,
00192         'nsub'     => 8836,
00193         'Ntilde'   => 209,
00194         'ntilde'   => 241,
00195         'Nu'       => 925,
00196         'nu'       => 957,
00197         'Oacute'   => 211,
00198         'oacute'   => 243,
00199         'Ocirc'    => 212,
00200         'ocirc'    => 244,
00201         'OElig'    => 338,
00202         'oelig'    => 339,
00203         'Ograve'   => 210,
00204         'ograve'   => 242,
00205         'oline'    => 8254,
00206         'Omega'    => 937,
00207         'omega'    => 969,
00208         'Omicron'  => 927,
00209         'omicron'  => 959,
00210         'oplus'    => 8853,
00211         'or'       => 8744,
00212         'ordf'     => 170,
00213         'ordm'     => 186,
00214         'Oslash'   => 216,
00215         'oslash'   => 248,
00216         'Otilde'   => 213,
00217         'otilde'   => 245,
00218         'otimes'   => 8855,
00219         'Ouml'     => 214,
00220         'ouml'     => 246,
00221         'para'     => 182,
00222         'part'     => 8706,
00223         'permil'   => 8240,
00224         'perp'     => 8869,
00225         'Phi'      => 934,
00226         'phi'      => 966,
00227         'Pi'       => 928,
00228         'pi'       => 960,
00229         'piv'      => 982,
00230         'plusmn'   => 177,
00231         'pound'    => 163,
00232         'prime'    => 8242,
00233         'Prime'    => 8243,
00234         'prod'     => 8719,
00235         'prop'     => 8733,
00236         'Psi'      => 936,
00237         'psi'      => 968,
00238         'quot'     => 34,
00239         'radic'    => 8730,
00240         'rang'     => 9002,
00241         'raquo'    => 187,
00242         'rarr'     => 8594,
00243         'rArr'     => 8658,
00244         'rceil'    => 8969,
00245         'rdquo'    => 8221,
00246         'real'     => 8476,
00247         'reg'      => 174,
00248         'rfloor'   => 8971,
00249         'Rho'      => 929,
00250         'rho'      => 961,
00251         'rlm'      => 8207,
00252         'rsaquo'   => 8250,
00253         'rsquo'    => 8217,
00254         'sbquo'    => 8218,
00255         'Scaron'   => 352,
00256         'scaron'   => 353,
00257         'sdot'     => 8901,
00258         'sect'     => 167,
00259         'shy'      => 173,
00260         'Sigma'    => 931,
00261         'sigma'    => 963,
00262         'sigmaf'   => 962,
00263         'sim'      => 8764,
00264         'spades'   => 9824,
00265         'sub'      => 8834,
00266         'sube'     => 8838,
00267         'sum'      => 8721,
00268         'sup'      => 8835,
00269         'sup1'     => 185,
00270         'sup2'     => 178,
00271         'sup3'     => 179,
00272         'supe'     => 8839,
00273         'szlig'    => 223,
00274         'Tau'      => 932,
00275         'tau'      => 964,
00276         'there4'   => 8756,
00277         'Theta'    => 920,
00278         'theta'    => 952,
00279         'thetasym' => 977,
00280         'thinsp'   => 8201,
00281         'THORN'    => 222,
00282         'thorn'    => 254,
00283         'tilde'    => 732,
00284         'times'    => 215,
00285         'trade'    => 8482,
00286         'Uacute'   => 218,
00287         'uacute'   => 250,
00288         'uarr'     => 8593,
00289         'uArr'     => 8657,
00290         'Ucirc'    => 219,
00291         'ucirc'    => 251,
00292         'Ugrave'   => 217,
00293         'ugrave'   => 249,
00294         'uml'      => 168,
00295         'upsih'    => 978,
00296         'Upsilon'  => 933,
00297         'upsilon'  => 965,
00298         'Uuml'     => 220,
00299         'uuml'     => 252,
00300         'weierp'   => 8472,
00301         'Xi'       => 926,
00302         'xi'       => 958,
00303         'Yacute'   => 221,
00304         'yacute'   => 253,
00305         'yen'      => 165,
00306         'Yuml'     => 376,
00307         'yuml'     => 255,
00308         'Zeta'     => 918,
00309         'zeta'     => 950,
00310         'zwj'      => 8205,
00311         'zwnj'     => 8204
00312     );
00313
00317     private static $htmlEntityAliases = array(
00318         'רלמ' => 'rlm',
00319         'رلم' => 'rlm',
00320     );
00321
00325     private static $attribsRegex;
00326
00333     static function getAttribsRegex() {
00334         if ( self::$attribsRegex === null ) {
00335             $attribFirst = '[:A-Z_a-z0-9]';
00336             $attrib = '[:A-Z_a-z-.0-9]';
00337             $space = '[\x09\x0a\x0d\x20]';
00338             self::$attribsRegex =
00339                 "/(?:^|$space)({$attribFirst}{$attrib}*)
00340                   ($space*=$space*
00341                     (?:
00342                      # The attribute value: quoted or alone
00343                       \"([^<\"]*)\"
00344                      | '([^<']*)'
00345                      |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00346                      |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00347                                          # colors are specified like this.
00348                                          # We'll be normalizing it.
00349                     )
00350                 )?(?=$space|\$)/sx";
00351         }
00352         return self::$attribsRegex;
00353     }
00354
00367     static function removeHTMLtags( $text, $processCallback = null,
00368         $args = array(), $extratags = array(), $removetags = array()
00369     ) {
00370         global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
00371
00372         static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
00373             $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
00374
00375         wfProfileIn( __METHOD__ );
00376
00377         // Base our staticInitialised variable off of the global config state so that if the globals
00378         // are changed (like in the screwed up test system) we will re-initialise the settings.
00379         $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
00380         if ( !$staticInitialised || $staticInitialised != $globalContext ) {
00381
00382             $htmlpairsStatic = array( # Tags that must be closed
00383                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
00384                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
00385                 'strike', 'strong', 'tt', 'var', 'div', 'center',
00386                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
00387                 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
00388                 'kbd', 'samp', 'data', 'time', 'mark'
00389             );
00390             $htmlsingle = array(
00391                 'br', 'wbr', 'hr', 'li', 'dt', 'dd'
00392             );
00393             $htmlsingleonly = array( # Elements that cannot have close tags
00394                 'br', 'wbr', 'hr'
00395             );
00396             if ( $wgAllowMicrodataAttributes ) {
00397                 $htmlsingle[] = $htmlsingleonly[] = 'meta';
00398                 $htmlsingle[] = $htmlsingleonly[] = 'link';
00399             }
00400             $htmlnest = array( # Tags that can be nested--??
00401                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
00402                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
00403                 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
00404             );
00405             $tabletags = array( # Can only appear inside table, we will close them
00406                 'td', 'th', 'tr',
00407             );
00408             $htmllist = array( # Tags used by list
00409                 'ul', 'ol',
00410             );
00411             $listtags = array( # Tags that can appear in a list
00412                 'li',
00413             );
00414
00415             if ( $wgAllowImageTag ) {
00416                 $htmlsingle[] = 'img';
00417                 $htmlsingleonly[] = 'img';
00418             }
00419
00420             $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
00421             $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
00422
00423             # Convert them all to hashtables for faster lookup
00424             $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
00425                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
00426             foreach ( $vars as $var ) {
00427                 $$var = array_flip( $$var );
00428             }
00429             $staticInitialised = $globalContext;
00430         }
00431         # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
00432         $extratags = array_flip( $extratags );
00433         $removetags = array_flip( $removetags );
00434         $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
00435         $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
00436
00437         # Remove HTML comments
00438         $text = Sanitizer::removeHTMLcomments( $text );
00439         $bits = explode( '<', $text );
00440         $text = str_replace( '>', '&gt;', array_shift( $bits ) );
00441         if ( !$wgUseTidy ) {
00442             $tagstack = $tablestack = array();
00443             foreach ( $bits as $x ) {
00444                 $regs = array();
00445                 # $slash: Does the current element start with a '/'?
00446                 # $t: Current element name
00447                 # $params: String between element name and >
00448                 # $brace: Ending '>' or '/>'
00449                 # $rest: Everything until the next element of $bits
00450                 if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
00451                     list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00452                 } else {
00453                     $slash = $t = $params = $brace = $rest = null;
00454                 }
00455
00456                 $badtag = false;
00457                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00458                     # Check our stack
00459                     if ( $slash && isset( $htmlsingleonly[$t] ) ) {
00460                         $badtag = true;
00461                     } elseif ( $slash ) {
00462                         # Closing a tag... is it the one we just opened?
00463                         wfSuppressWarnings();
00464                         $ot = array_pop( $tagstack );
00465                         wfRestoreWarnings();
00466
00467                         if ( $ot != $t ) {
00468                             if ( isset( $htmlsingleallowed[$ot] ) ) {
00469                                 # Pop all elements with an optional close tag
00470                                 # and see if we find a match below them
00471                                 $optstack = array();
00472                                 array_push( $optstack, $ot );
00473                                 wfSuppressWarnings();
00474                                 $ot = array_pop( $tagstack );
00475                                 wfRestoreWarnings();
00476                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
00477                                     array_push( $optstack, $ot );
00478                                     wfSuppressWarnings();
00479                                     $ot = array_pop( $tagstack );
00480                                     wfRestoreWarnings();
00481                                 }
00482                                 if ( $t != $ot ) {
00483                                     # No match. Push the optional elements back again
00484                                     $badtag = true;
00485                                     wfSuppressWarnings();
00486                                     $ot = array_pop( $optstack );
00487                                     wfRestoreWarnings();
00488                                     while ( $ot ) {
00489                                         array_push( $tagstack, $ot );
00490                                         wfSuppressWarnings();
00491                                         $ot = array_pop( $optstack );
00492                                         wfRestoreWarnings();
00493                                     }
00494                                 }
00495                             } else {
00496                                 wfSuppressWarnings();
00497                                 array_push( $tagstack, $ot );
00498                                 wfRestoreWarnings();
00499
00500                                 # <li> can be nested in <ul> or <ol>, skip those cases:
00501                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
00502                                     $badtag = true;
00503                                 }
00504                             }
00505                         } else {
00506                             if ( $t == 'table' ) {
00507                                 $tagstack = array_pop( $tablestack );
00508                             }
00509                         }
00510                         $newparams = '';
00511                     } else {
00512                         # Keep track for later
00513                         if ( isset( $tabletags[$t] ) &&
00514                         !in_array( 'table', $tagstack ) ) {
00515                             $badtag = true;
00516                         } elseif ( in_array( $t, $tagstack ) &&
00517                         !isset( $htmlnest[$t] ) ) {
00518                             $badtag = true;
00519                         # Is it a self closed htmlpair ? (bug 5487)
00520                         } elseif ( $brace == '/>' &&
00521                         isset( $htmlpairs[$t] ) ) {
00522                             $badtag = true;
00523                         } elseif ( isset( $htmlsingleonly[$t] ) ) {
00524                             # Hack to force empty tag for unclosable elements
00525                             $brace = '/>';
00526                         } elseif ( isset( $htmlsingle[$t] ) ) {
00527                             # Hack to not close $htmlsingle tags
00528                             $brace = null;
00529                             # Still need to push this optionally-closed tag to
00530                             # the tag stack so that we can match end tags
00531                             # instead of marking them as bad.
00532                             array_push( $tagstack, $t );
00533                         } elseif ( isset( $tabletags[$t] )
00534                         && in_array( $t, $tagstack ) ) {
00535                             // New table tag but forgot to close the previous one
00536                             $text .= "</$t>";
00537                         } else {
00538                             if ( $t == 'table' ) {
00539                                 array_push( $tablestack, $tagstack );
00540                                 $tagstack = array();
00541                             }
00542                             array_push( $tagstack, $t );
00543                         }
00544
00545                         # Replace any variables or template parameters with
00546                         # plaintext results.
00547                         if ( is_callable( $processCallback ) ) {
00548                             call_user_func_array( $processCallback, array( &$params, $args ) );
00549                         }
00550
00551                         if ( !Sanitizer::validateTag( $params, $t ) ) {
00552                             $badtag = true;
00553                         }
00554
00555                         # Strip non-approved attributes from the tag
00556                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
00557                     }
00558                     if ( !$badtag ) {
00559                         $rest = str_replace( '>', '&gt;', $rest );
00560                         $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
00561                         $text .= "<$slash$t$newparams$close>$rest";
00562                         continue;
00563                     }
00564                 }
00565                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
00566             }
00567             # Close off any remaining tags
00568             while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
00569                 $text .= "</$t>\n";
00570                 if ( $t == 'table' ) {
00571                     $tagstack = array_pop( $tablestack );
00572                 }
00573             }
00574         } else {
00575             # this might be possible using tidy itself
00576             foreach ( $bits as $x ) {
00577                 preg_match(
00578                     '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
00579                     $x,
00580                     $regs
00581                 );
00582
00583                 wfSuppressWarnings();
00584                 list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
00585                 wfRestoreWarnings();
00586
00587                 $badtag = false;
00588                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
00589                     if ( is_callable( $processCallback ) ) {
00590                         call_user_func_array( $processCallback, array( &$params, $args ) );
00591                     }
00592
00593                     if ( !Sanitizer::validateTag( $params, $t ) ) {
00594                         $badtag = true;
00595                     }
00596
00597                     $newparams = Sanitizer::fixTagAttributes( $params, $t );
00598                     if ( !$badtag ) {
00599                         $rest = str_replace( '>', '&gt;', $rest );
00600                         $text .= "<$slash$t$newparams$brace$rest";
00601                         continue;
00602                     }
00603                 }
00604                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
00605             }
00606         }
00607         wfProfileOut( __METHOD__ );
00608         return $text;
00609     }
00610
00621     static function removeHTMLcomments( $text ) {
00622         wfProfileIn( __METHOD__ );
00623         while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
00624             $end = strpos( $text, '-->', $start + 4 );
00625             if ( $end === false ) {
00626                 # Unterminated comment; bail out
00627                 break;
00628             }
00629
00630             $end += 3;
00631
00632             # Trim space and newline if the comment is both
00633             # preceded and followed by a newline
00634             $spaceStart = max( $start - 1, 0 );
00635             $spaceLen = $end - $spaceStart;
00636             while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
00637                 $spaceStart--;
00638                 $spaceLen++;
00639             }
00640             while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
00641                 $spaceLen++;
00642             }
00643             if ( substr( $text, $spaceStart, 1 ) === "\n"
00644                 && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
00645                 # Remove the comment, leading and trailing
00646                 # spaces, and leave only one newline.
00647                 $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
00648             } else {
00649                 # Remove just the comment.
00650                 $text = substr_replace( $text, '', $start, $end - $start );
00651             }
00652         }
00653         wfProfileOut( __METHOD__ );
00654         return $text;
00655     }
00656
00669     static function validateTag( $params, $element ) {
00670         $params = Sanitizer::decodeTagAttributes( $params );
00671
00672         if ( $element == 'meta' || $element == 'link' ) {
00673             if ( !isset( $params['itemprop'] ) ) {
00674                 // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
00675                 return false;
00676             }
00677             if ( $element == 'meta' && !isset( $params['content'] ) ) {
00678                 // <meta> must have a content="" for the itemprop
00679                 return false;
00680             }
00681             if ( $element == 'link' && !isset( $params['href'] ) ) {
00682                 // <link> must have an associated href=""
00683                 return false;
00684             }
00685         }
00686
00687         return true;
00688     }
00689
00705     static function validateTagAttributes( $attribs, $element ) {
00706         return Sanitizer::validateAttributes( $attribs,
00707             Sanitizer::attributeWhitelist( $element ) );
00708     }
00709
00725     static function validateAttributes( $attribs, $whitelist ) {
00726         global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
00727
00728         $whitelist = array_flip( $whitelist );
00729         $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
00730
00731         $out = array();
00732         foreach ( $attribs as $attribute => $value ) {
00733             #allow XML namespace declaration if RDFa is enabled
00734             if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
00735                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00736                     $out[$attribute] = $value;
00737                 }
00738
00739                 continue;
00740             }
00741
00742             # Allow any attribute beginning with "data-"
00743             if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
00744                 continue;
00745             }
00746
00747             # Strip javascript "expression" from stylesheets.
00748             # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
00749             if ( $attribute == 'style' ) {
00750                 $value = Sanitizer::checkCss( $value );
00751             }
00752
00753             if ( $attribute === 'id' ) {
00754                 $value = Sanitizer::escapeId( $value, 'noninitial' );
00755             }
00756
00757             # WAI-ARIA
00758             # http://www.w3.org/TR/wai-aria/
00759             # http://www.whatwg.org/html/elements.html#wai-aria
00760             # For now we only support role="presentation" until we work out what roles should be
00761             # usable by content and we ensure that our code explicitly rejects patterns that
00762             # violate HTML5's ARIA restrictions.
00763             if ( $attribute === 'role' && $value !== 'presentation' ) {
00764                 continue;
00765             }
00766
00767             // RDFa and microdata properties allow URLs, URIs and/or CURIs.
00768             // Check them for sanity.
00769             if ( $attribute === 'rel' || $attribute === 'rev'
00770                 # RDFa
00771                 || $attribute === 'about' || $attribute === 'property'
00772                 || $attribute === 'resource' || $attribute === 'datatype'
00773                 || $attribute === 'typeof'
00774                 # HTML5 microdata
00775                 || $attribute === 'itemid' || $attribute === 'itemprop'
00776                 || $attribute === 'itemref' || $attribute === 'itemscope'
00777                 || $attribute === 'itemtype'
00778             ) {
00779                 //Paranoia. Allow "simple" values but suppress javascript
00780                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
00781                     continue;
00782                 }
00783             }
00784
00785             # NOTE: even though elements using href/src are not allowed directly, supply
00786             #       validation code that can be used by tag hook handlers, etc
00787             if ( $attribute === 'href' || $attribute === 'src' ) {
00788                 if ( !preg_match( $hrefExp, $value ) ) {
00789                     continue; //drop any href or src attributes not using an allowed protocol.
00790                     // NOTE: this also drops all relative URLs
00791                 }
00792             }
00793
00794             // If this attribute was previously set, override it.
00795             // Output should only have one attribute of each name.
00796             $out[$attribute] = $value;
00797         }
00798
00799         if ( $wgAllowMicrodataAttributes ) {
00800             # itemtype, itemid, itemref don't make sense without itemscope
00801             if ( !array_key_exists( 'itemscope', $out ) ) {
00802                 unset( $out['itemtype'] );
00803                 unset( $out['itemid'] );
00804                 unset( $out['itemref'] );
00805             }
00806             # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
00807         }
00808         return $out;
00809     }
00810
00821     static function mergeAttributes( $a, $b ) {
00822         $out = array_merge( $a, $b );
00823         if ( isset( $a['class'] ) && isset( $b['class'] )
00824             && is_string( $a['class'] ) && is_string( $b['class'] )
00825             && $a['class'] !== $b['class']
00826         ) {
00827             $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
00828                 -1, PREG_SPLIT_NO_EMPTY );
00829             $out['class'] = implode( ' ', array_unique( $classes ) );
00830         }
00831         return $out;
00832     }
00833
00843     public static function normalizeCss( $value ) {
00844
00845         // Decode character references like &#123;
00846         $value = Sanitizer::decodeCharReferences( $value );
00847
00848         // Decode escape sequences and line continuation
00849         // See the grammar in the CSS 2 spec, appendix D.
00850         // This has to be done AFTER decoding character references.
00851         // This means it isn't possible for this function to return
00852         // unsanitized escape sequences. It is possible to manufacture
00853         // input that contains character references that decode to
00854         // escape sequences that decode to character references, but
00855         // it's OK for the return value to contain character references
00856         // because the caller is supposed to escape those anyway.
00857         static $decodeRegex;
00858         if ( !$decodeRegex ) {
00859             $space = '[\\x20\\t\\r\\n\\f]';
00860             $nl = '(?:\\n|\\r\\n|\\r|\\f)';
00861             $backslash = '\\\\';
00862             $decodeRegex = "/ $backslash
00863                 (?:
00864                     ($nl) |  # 1. Line continuation
00865                     ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
00866                     (.) | # 3. backslash cancelling special meaning
00867                     () | # 4. backslash at end of string
00868                 )/xu";
00869         }
00870         $value = preg_replace_callback( $decodeRegex,
00871             array( __CLASS__, 'cssDecodeCallback' ), $value );
00872
00873         // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
00874         $value = preg_replace_callback(
00875             '/[！-［］-ｚ]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
00876             function ( $matches ) {
00877                 $cp = utf8ToCodepoint( $matches[0] );
00878                 if ( $cp === false ) {
00879                     return '';
00880                 }
00881                 return chr( $cp - 65248 ); // ASCII range \x21-\x7A
00882             },
00883             $value
00884         );
00885
00886         // Convert more characters IE6 might treat as ascii
00887         // U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
00888         $value = str_replace(
00889             array( 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ),
00890             array( 'r', 'n', 'n', 'l', 'i', '(', '(' ),
00891             $value
00892         );
00893
00894         // Let the value through if it's nothing but a single comment, to
00895         // allow other functions which may reject it to pass some error
00896         // message through.
00897         if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
00898             // Remove any comments; IE gets token splitting wrong
00899             // This must be done AFTER decoding character references and
00900             // escape sequences, because those steps can introduce comments
00901             // This step cannot introduce character references or escape
00902             // sequences, because it replaces comments with spaces rather
00903             // than removing them completely.
00904             $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
00905
00906             // Remove anything after a comment-start token, to guard against
00907             // incorrect client implementations.
00908             $commentPos = strpos( $value, '/*' );
00909             if ( $commentPos !== false ) {
00910                 $value = substr( $value, 0, $commentPos );
00911             }
00912         }
00913
00914         // S followed by repeat, iteration, or prolonged sound marks,
00915         // which IE will treat as "ss"
00916         $value = preg_replace(
00917             '/s(?:
00918                 \xE3\x80\xB1 | # U+3031
00919                 \xE3\x82\x9D | # U+309D
00920                 \xE3\x83\xBC | # U+30FC
00921                 \xE3\x83\xBD | # U+30FD
00922                 \xEF\xB9\xBC | # U+FE7C
00923                 \xEF\xB9\xBD | # U+FE7D
00924                 \xEF\xBD\xB0   # U+FF70
00925             )/ix',
00926             'ss',
00927             $value
00928         );
00929
00930         return $value;
00931     }
00932
00933
00952     static function checkCss( $value ) {
00953         $value = self::normalizeCss( $value );
00954
00955         // Reject problematic keywords and control characters
00956         if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
00957             return '/* invalid control char */';
00958         } elseif ( preg_match(
00959             '! expression
00960                 | filter\s*:
00961                 | accelerator\s*:
00962                 | -o-link\s*:
00963                 | -o-link-source\s*:
00964                 | -o-replace\s*:
00965                 | url\s*\(
00966                 | image\s*\(
00967                 | image-set\s*\(
00968             !ix', $value ) ) {
00969             return '/* insecure input */';
00970         }
00971         return $value;
00972     }
00973
00978     static function cssDecodeCallback( $matches ) {
00979         if ( $matches[1] !== '' ) {
00980             // Line continuation
00981             return '';
00982         } elseif ( $matches[2] !== '' ) {
00983             $char = codepointToUtf8( hexdec( $matches[2] ) );
00984         } elseif ( $matches[3] !== '' ) {
00985             $char = $matches[3];
00986         } else {
00987             $char = '\\';
00988         }
00989         if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
00990             // These characters need to be escaped in strings
00991             // Clean up the escape sequence to avoid parsing errors by clients
00992             return '\\' . dechex( ord( $char ) ) . ' ';
00993         } else {
00994             // Decode unnecessary escape
00995             return $char;
00996         }
00997     }
00998
01018     static function fixTagAttributes( $text, $element ) {
01019         if ( trim( $text ) == '' ) {
01020             return '';
01021         }
01022
01023         $decoded = Sanitizer::decodeTagAttributes( $text );
01024         $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
01025
01026         return Sanitizer::safeEncodeTagAttributes( $stripped );
01027     }
01028
01034     static function encodeAttribute( $text ) {
01035         $encValue = htmlspecialchars( $text, ENT_QUOTES );
01036
01037         // Whitespace is normalized during attribute decoding,
01038         // so if we've been passed non-spaces we must encode them
01039         // ahead of time or they won't be preserved.
01040         $encValue = strtr( $encValue, array(
01041             "\n" => '&#10;',
01042             "\r" => '&#13;',
01043             "\t" => '&#9;',
01044         ) );
01045
01046         return $encValue;
01047     }
01048
01055     static function safeEncodeAttribute( $text ) {
01056         $encValue = Sanitizer::encodeAttribute( $text );
01057
01058         # Templates and links may be expanded in later parsing,
01059         # creating invalid or dangerous output. Suppress this.
01060         $encValue = strtr( $encValue, array(
01061             '<'    => '&lt;',   // This should never happen,
01062             '>'    => '&gt;',   // we've received invalid input
01063             '"'    => '&quot;', // which should have been escaped.
01064             '{'    => '&#123;',
01065             '['    => '&#91;',
01066             "''"   => '&#39;&#39;',
01067             'ISBN' => '&#73;SBN',
01068             'RFC'  => '&#82;FC',
01069             'PMID' => '&#80;MID',
01070             '|'    => '&#124;',
01071             '__'   => '&#95;_',
01072         ) );
01073
01074         # Stupid hack
01075         $encValue = preg_replace_callback(
01076             '/((?i)' . wfUrlProtocols() . ')/',
01077             array( 'Sanitizer', 'armorLinksCallback' ),
01078             $encValue );
01079         return $encValue;
01080     }
01081
01113     static function escapeId( $id, $options = array() ) {
01114         global $wgExperimentalHtmlIds;
01115         $options = (array)$options;
01116
01117         $id = Sanitizer::decodeCharReferences( $id );
01118
01119         if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
01120             $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
01121             $id = trim( $id, '_' );
01122             if ( $id === '' ) {
01123                 # Must have been all whitespace to start with.
01124                 return '_';
01125             } else {
01126                 return $id;
01127             }
01128         }
01129
01130         # HTML4-style escaping
01131         static $replace = array(
01132             '%3A' => ':',
01133             '%' => '.'
01134         );
01135
01136         $id = urlencode( strtr( $id, ' ', '_' ) );
01137         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
01138
01139         if ( !preg_match( '/^[a-zA-Z]/', $id )
01140         && !in_array( 'noninitial', $options ) ) {
01141             // Initial character must be a letter!
01142             $id = "x$id";
01143         }
01144         return $id;
01145     }
01146
01158     static function escapeClass( $class ) {
01159         // Convert ugly stuff to underscores and kill underscores in ugly places
01160         return rtrim( preg_replace(
01161             array( '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ),
01162             '_',
01163             $class ), '_' );
01164     }
01165
01173     static function escapeHtmlAllowEntities( $html ) {
01174         $html = Sanitizer::decodeCharReferences( $html );
01175         # It seems wise to escape ' as well as ", as a matter of course.  Can't
01176         # hurt.
01177         $html = htmlspecialchars( $html, ENT_QUOTES );
01178         return $html;
01179     }
01180
01186     private static function armorLinksCallback( $matches ) {
01187         return str_replace( ':', '&#58;', $matches[1] );
01188     }
01189
01198     public static function decodeTagAttributes( $text ) {
01199         if ( trim( $text ) == '' ) {
01200             return array();
01201         }
01202
01203         $attribs = array();
01204         $pairs = array();
01205         if ( !preg_match_all(
01206             self::getAttribsRegex(),
01207             $text,
01208             $pairs,
01209             PREG_SET_ORDER ) ) {
01210             return $attribs;
01211         }
01212
01213         foreach ( $pairs as $set ) {
01214             $attribute = strtolower( $set[1] );
01215             $value = Sanitizer::getTagAttributeCallback( $set );
01216
01217             // Normalize whitespace
01218             $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
01219             $value = trim( $value );
01220
01221             // Decode character references
01222             $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
01223         }
01224         return $attribs;
01225     }
01226
01234     public static function safeEncodeTagAttributes( $assoc_array ) {
01235         $attribs = array();
01236         foreach ( $assoc_array as $attribute => $value ) {
01237             $encAttribute = htmlspecialchars( $attribute );
01238             $encValue = Sanitizer::safeEncodeAttribute( $value );
01239
01240             $attribs[] = "$encAttribute=\"$encValue\"";
01241         }
01242         return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
01243     }
01244
01253     private static function getTagAttributeCallback( $set ) {
01254         if ( isset( $set[6] ) ) {
01255             # Illegal #XXXXXX color with no quotes.
01256             return $set[6];
01257         } elseif ( isset( $set[5] ) ) {
01258             # No quotes.
01259             return $set[5];
01260         } elseif ( isset( $set[4] ) ) {
01261             # Single-quoted
01262             return $set[4];
01263         } elseif ( isset( $set[3] ) ) {
01264             # Double-quoted
01265             return $set[3];
01266         } elseif ( !isset( $set[2] ) ) {
01267             # In XHTML, attributes must have a value.
01268             # For 'reduced' form, return explicitly the attribute name here.
01269             return $set[1];
01270         } else {
01271             throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
01272         }
01273     }
01274
01287     private static function normalizeAttributeValue( $text ) {
01288         return str_replace( '"', '&quot;',
01289             self::normalizeWhitespace(
01290                 Sanitizer::normalizeCharReferences( $text ) ) );
01291     }
01292
01297     private static function normalizeWhitespace( $text ) {
01298         return preg_replace(
01299             '/\r\n|[\x20\x0d\x0a\x09]/',
01300             ' ',
01301             $text );
01302     }
01303
01312     static function normalizeSectionNameWhitespace( $section ) {
01313         return trim( preg_replace( '/[ _]+/', ' ', $section ) );
01314     }
01315
01331     static function normalizeCharReferences( $text ) {
01332         return preg_replace_callback(
01333             self::CHAR_REFS_REGEX,
01334             array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
01335             $text );
01336     }
01337
01342     static function normalizeCharReferencesCallback( $matches ) {
01343         $ret = null;
01344         if ( $matches[1] != '' ) {
01345             $ret = Sanitizer::normalizeEntity( $matches[1] );
01346         } elseif ( $matches[2] != '' ) {
01347             $ret = Sanitizer::decCharReference( $matches[2] );
01348         } elseif ( $matches[3] != '' ) {
01349             $ret = Sanitizer::hexCharReference( $matches[3] );
01350         }
01351         if ( is_null( $ret ) ) {
01352             return htmlspecialchars( $matches[0] );
01353         } else {
01354             return $ret;
01355         }
01356     }
01357
01368     static function normalizeEntity( $name ) {
01369         if ( isset( self::$htmlEntityAliases[$name] ) ) {
01370             return '&' . self::$htmlEntityAliases[$name] . ';';
01371         } elseif ( in_array( $name,
01372         array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
01373             return "&$name;";
01374         } elseif ( isset( self::$htmlEntities[$name] ) ) {
01375             return '&#' . self::$htmlEntities[$name] . ';';
01376         } else {
01377             return "&amp;$name;";
01378         }
01379     }
01380
01385     static function decCharReference( $codepoint ) {
01386         $point = intval( $codepoint );
01387         if ( Sanitizer::validateCodepoint( $point ) ) {
01388             return sprintf( '&#%d;', $point );
01389         } else {
01390             return null;
01391         }
01392     }
01393
01398     static function hexCharReference( $codepoint ) {
01399         $point = hexdec( $codepoint );
01400         if ( Sanitizer::validateCodepoint( $point ) ) {
01401             return sprintf( '&#x%x;', $point );
01402         } else {
01403             return null;
01404         }
01405     }
01406
01412     private static function validateCodepoint( $codepoint ) {
01413         return $codepoint == 0x09
01414             || $codepoint == 0x0a
01415             || $codepoint == 0x0d
01416             || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
01417             || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
01418             || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
01419     }
01420
01428     public static function decodeCharReferences( $text ) {
01429         return preg_replace_callback(
01430             self::CHAR_REFS_REGEX,
01431             array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01432             $text );
01433     }
01434
01445     public static function decodeCharReferencesAndNormalize( $text ) {
01446         global $wgContLang;
01447         $text = preg_replace_callback(
01448             self::CHAR_REFS_REGEX,
01449             array( 'Sanitizer', 'decodeCharReferencesCallback' ),
01450             $text, /* limit */ -1, $count );
01451
01452         if ( $count ) {
01453             return $wgContLang->normalize( $text );
01454         } else {
01455             return $text;
01456         }
01457     }
01458
01463     static function decodeCharReferencesCallback( $matches ) {
01464         if ( $matches[1] != '' ) {
01465             return Sanitizer::decodeEntity( $matches[1] );
01466         } elseif ( $matches[2] != '' ) {
01467             return Sanitizer::decodeChar( intval( $matches[2] ) );
01468         } elseif ( $matches[3] != '' ) {
01469             return Sanitizer::decodeChar( hexdec( $matches[3] ) );
01470         }
01471         # Last case should be an ampersand by itself
01472         return $matches[0];
01473     }
01474
01482     static function decodeChar( $codepoint ) {
01483         if ( Sanitizer::validateCodepoint( $codepoint ) ) {
01484             return codepointToUtf8( $codepoint );
01485         } else {
01486             return UTF8_REPLACEMENT;
01487         }
01488     }
01489
01498     static function decodeEntity( $name ) {
01499         if ( isset( self::$htmlEntityAliases[$name] ) ) {
01500             $name = self::$htmlEntityAliases[$name];
01501         }
01502         if ( isset( self::$htmlEntities[$name] ) ) {
01503             return codepointToUtf8( self::$htmlEntities[$name] );
01504         } else {
01505             return "&$name;";
01506         }
01507     }
01508
01515     static function attributeWhitelist( $element ) {
01516         $list = Sanitizer::setupAttributeWhitelist();
01517         return isset( $list[$element] )
01518             ? $list[$element]
01519             : array();
01520     }
01521
01527     static function setupAttributeWhitelist() {
01528         global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
01529         static $whitelist, $staticInitialised;
01530
01531         $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
01532
01533         if ( $whitelist !== null && $staticInitialised == $globalContext ) {
01534             return $whitelist;
01535         }
01536
01537         $common = array(
01538             # HTML
01539             'id',
01540             'class',
01541             'style',
01542             'lang',
01543             'dir',
01544             'title',
01545
01546             # WAI-ARIA
01547             'role',
01548         );
01549
01550         if ( $wgAllowRdfaAttributes ) {
01551             # RDFa attributes as specified in section 9 of
01552             # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
01553             $common = array_merge( $common, array(
01554                 'about', 'property', 'resource', 'datatype', 'typeof',
01555             ) );
01556         }
01557
01558         if ( $wgAllowMicrodataAttributes ) {
01559             # add HTML5 microdata tags as specified by
01560             # http://www.whatwg.org/html/microdata.html#the-microdata-model
01561             $common = array_merge( $common, array(
01562                 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
01563             ) );
01564         }
01565
01566         $block = array_merge( $common, array( 'align' ) );
01567         $tablealign = array( 'align', 'valign' );
01568         $tablecell = array(
01569             'abbr',
01570             'axis',
01571             'headers',
01572             'scope',
01573             'rowspan',
01574             'colspan',
01575             'nowrap', # deprecated
01576             'width', # deprecated
01577             'height', # deprecated
01578             'bgcolor', # deprecated
01579         );
01580
01581         # Numbers refer to sections in HTML 4.01 standard describing the element.
01582         # See: http://www.w3.org/TR/html4/
01583         $whitelist = array(
01584             # 7.5.4
01585             'div'        => $block,
01586             'center'     => $common, # deprecated
01587             'span'       => $common,
01588
01589             # 7.5.5
01590             'h1'         => $block,
01591             'h2'         => $block,
01592             'h3'         => $block,
01593             'h4'         => $block,
01594             'h5'         => $block,
01595             'h6'         => $block,
01596
01597             # 7.5.6
01598             # address
01599
01600             # 8.2.4
01601             'bdo'        => $common,
01602
01603             # 9.2.1
01604             'em'         => $common,
01605             'strong'     => $common,
01606             'cite'       => $common,
01607             'dfn'        => $common,
01608             'code'       => $common,
01609             'samp'       => $common,
01610             'kbd'        => $common,
01611             'var'        => $common,
01612             'abbr'       => $common,
01613             # acronym
01614
01615             # 9.2.2
01616             'blockquote' => array_merge( $common, array( 'cite' ) ),
01617             'q'          => array_merge( $common, array( 'cite' ) ),
01618
01619             # 9.2.3
01620             'sub'        => $common,
01621             'sup'        => $common,
01622
01623             # 9.3.1
01624             'p'          => $block,
01625
01626             # 9.3.2
01627             'br'         => array_merge( $common, array( 'clear' ) ),
01628
01629             # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
01630             'wbr'        => $common,
01631
01632             # 9.3.4
01633             'pre'        => array_merge( $common, array( 'width' ) ),
01634
01635             # 9.4
01636             'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01637             'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01638
01639             # 10.2
01640             'ul'         => array_merge( $common, array( 'type' ) ),
01641             'ol'         => array_merge( $common, array( 'type', 'start' ) ),
01642             'li'         => array_merge( $common, array( 'type', 'value' ) ),
01643
01644             # 10.3
01645             'dl'         => $common,
01646             'dd'         => $common,
01647             'dt'         => $common,
01648
01649             # 11.2.1
01650             'table'      => array_merge( $common,
01651                                 array( 'summary', 'width', 'border', 'frame',
01652                                         'rules', 'cellspacing', 'cellpadding',
01653                                         'align', 'bgcolor',
01654                                 ) ),
01655
01656             # 11.2.2
01657             'caption'    => $block,
01658
01659             # 11.2.3
01660             'thead'      => $common,
01661             'tfoot'      => $common,
01662             'tbody'      => $common,
01663
01664             # 11.2.4
01665             'colgroup'   => array_merge( $common, array( 'span' ) ),
01666             'col'        => array_merge( $common, array( 'span' ) ),
01667
01668             # 11.2.5
01669             'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
01670
01671             # 11.2.6
01672             'td'         => array_merge( $common, $tablecell, $tablealign ),
01673             'th'         => array_merge( $common, $tablecell, $tablealign ),
01674
01675             # 12.2
01676             # NOTE: <a> is not allowed directly, but the attrib
01677             # whitelist is used from the Parser object
01678             'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
01679
01680             # 13.2
01681             # Not usually allowed, but may be used for extension-style hooks
01682             # such as <math> when it is rasterized, or if $wgAllowImageTag is
01683             # true
01684             'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
01685
01686             # 15.2.1
01687             'tt'         => $common,
01688             'b'          => $common,
01689             'i'          => $common,
01690             'big'        => $common,
01691             'small'      => $common,
01692             'strike'     => $common,
01693             's'          => $common,
01694             'u'          => $common,
01695
01696             # 15.2.2
01697             'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
01698             # basefont
01699
01700             # 15.3
01701             'hr'         => array_merge( $common, array( 'width' ) ),
01702
01703             # HTML Ruby annotation text module, simple ruby only.
01704             # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
01705             'ruby'       => $common,
01706             # rbc
01707             'rb'         => $common,
01708             'rp'         => $common,
01709             'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
01710             'rtc'         => $common,
01711
01712             # MathML root element, where used for extensions
01713             # 'title' may not be 100% valid here; it's XHTML
01714             # http://www.w3.org/TR/REC-MathML/
01715             'math'       => array( 'class', 'style', 'id', 'title' ),
01716
01717             # HTML 5 section 4.6
01718             'bdi' => $common,
01719
01720             # HTML5 elements, defined by:
01721             # http://www.whatwg.org/html/
01722             'data' => array_merge( $common, array( 'value' ) ),
01723             'time' => array_merge( $common, array( 'datetime' ) ),
01724             'mark' => $common,
01725
01726             // meta and link are only permitted by removeHTMLtags when Microdata
01727             // is enabled so we don't bother adding a conditional to hide these
01728             // Also meta and link are only valid in WikiText as Microdata elements
01729             // (ie: validateTag rejects tags missing the attributes needed for Microdata)
01730             // So we don't bother including $common attributes that have no purpose.
01731             'meta' => array( 'itemprop', 'content' ),
01732             'link' => array( 'itemprop', 'href' ),
01733         );
01734
01735         $staticInitialised = $globalContext;
01736
01737         return $whitelist;
01738     }
01739
01750     static function stripAllTags( $text ) {
01751         # Actual <tags>
01752         $text = StringUtils::delimiterReplace( '<', '>', '', $text );
01753
01754         # Normalize &entities and whitespace
01755         $text = self::decodeCharReferences( $text );
01756         $text = self::normalizeWhitespace( $text );
01757
01758         return $text;
01759     }
01760
01770     static function hackDocType() {
01771         $out = "<!DOCTYPE html [\n";
01772         foreach ( self::$htmlEntities as $entity => $codepoint ) {
01773             $out .= "<!ENTITY $entity \"&#$codepoint;\">";
01774         }
01775         $out .= "]>\n";
01776         return $out;
01777     }
01778
01783     static function cleanUrl( $url ) {
01784         # Normalize any HTML entities in input. They will be
01785         # re-escaped by makeExternalLink().
01786         $url = Sanitizer::decodeCharReferences( $url );
01787
01788         # Escape any control characters introduced by the above step
01789         $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
01790             array( __CLASS__, 'cleanUrlCallback' ), $url );
01791
01792         # Validate hostname portion
01793         $matches = array();
01794         if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
01795             list( /* $whole */, $protocol, $host, $rest ) = $matches;
01796
01797             // Characters that will be ignored in IDNs.
01798             // http://tools.ietf.org/html/3454#section-3.1
01799             // Strip them before further processing so blacklists and such work.
01800             $strip = "/
01801                 \\s|          # general whitespace
01802                 \xc2\xad|     # 00ad SOFT HYPHEN
01803                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
01804                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
01805                 \xe2\x81\xa0| # 2060 WORD JOINER
01806                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
01807                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
01808                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
01809                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
01810                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
01811                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
01812                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
01813                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
01814                 /xuD";
01815
01816             $host = preg_replace( $strip, '', $host );
01817
01818             // @todo FIXME: Validate hostnames here
01819
01820             return $protocol . $host . $rest;
01821         } else {
01822             return $url;
01823         }
01824     }
01825
01830     static function cleanUrlCallback( $matches ) {
01831         return urlencode( $matches[0] );
01832     }
01833
01862     public static function validateEmail( $addr ) {
01863         $result = null;
01864         if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
01865             return $result;
01866         }
01867
01868         // Please note strings below are enclosed in brackets [], this make the
01869         // hyphen "-" a range indicator. Hence it is double backslashed below.
01870         // See bug 26948
01871         $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
01872         $rfc1034_ldh_str = "a-z0-9\\-";
01873
01874         $html5_email_regexp = "/
01875         ^                      # start of string
01876         [$rfc5322_atext\\.]+    # user part which is liberal :p
01877         @                      # 'apostrophe'
01878         [$rfc1034_ldh_str]+       # First domain part
01879         (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
01880         $                      # End of string
01881         /ix"; // case Insensitive, eXtended
01882
01883         return (bool)preg_match( $html5_email_regexp, $addr );
01884     }
01885 }