[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/includes/libs/ -> IEContentAnalyzer.php (source)

   1  <?php
   2  /**
   3   * Simulation of Microsoft Internet Explorer's MIME type detection algorithm.
   4   *
   5   * @file
   6   * @todo Define the exact license of this file.
   7   */
   8  
   9  /**
  10   * This class simulates Microsoft Internet Explorer's terribly broken and
  11   * insecure MIME type detection algorithm. It can be used to check web uploads
  12   * with an apparently safe type, to see if IE will reinterpret them to produce
  13   * something dangerous.
  14   *
  15   * It is full of bugs and strange design choices, and should not under any
  16   * circumstances be used to determine a MIME type to present to a user or
  17   * client. (Apple Safari developers, this means you too.)
  18   *
  19   * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have
  20   * attempted to ensure that this code works in exactly the same way as Internet
  21   * Explorer, it does not share any source code, or creative choices such as
  22   * variable names, thus I (Tim Starling) claim copyright on it.
  23   *
  24   * It may be redistributed without restriction. To aid reuse, this class does
  25   * not depend on any MediaWiki module.
  26   */
  27  class IEContentAnalyzer {
  28      /**
  29       * Relevant data taken from the type table in IE 5
  30       */
  31      protected $baseTypeTable = array(
  32          'ambiguous' /*1*/ => array(
  33              'text/plain',
  34              'application/octet-stream',
  35              'application/x-netcdf', // [sic]
  36          ),
  37          'text' /*3*/ => array(
  38              'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
  39              'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
  40          ),
  41          'binary' /*4*/ => array(
  42              'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
  43              'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
  44              'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
  45              'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
  46              'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
  47              'application/x-msdownload'
  48          ),
  49          'html' /*5*/ => array( 'text/html' ),
  50      );
  51  
  52      /**
  53       * Changes to the type table in later versions of IE
  54       */
  55      protected $addedTypes = array(
  56          'ie07' => array(
  57              'text' => array( 'text/xml', 'application/xml' )
  58          ),
  59      );
  60  
  61      /**
  62       * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
  63       * typical Windows installation.
  64       *
  65       * Used for extension to MIME type mapping if detection fails.
  66       */
  67      protected $registry = array(
  68          '.323' => 'text/h323',
  69          '.3g2' => 'video/3gpp2',
  70          '.3gp' => 'video/3gpp',
  71          '.3gp2' => 'video/3gpp2',
  72          '.3gpp' => 'video/3gpp',
  73          '.aac' => 'audio/aac',
  74          '.ac3' => 'audio/ac3',
  75          '.accda' => 'application/msaccess',
  76          '.accdb' => 'application/msaccess',
  77          '.accdc' => 'application/msaccess',
  78          '.accde' => 'application/msaccess',
  79          '.accdr' => 'application/msaccess',
  80          '.accdt' => 'application/msaccess',
  81          '.ade' => 'application/msaccess',
  82          '.adp' => 'application/msaccess',
  83          '.adts' => 'audio/aac',
  84          '.ai' => 'application/postscript',
  85          '.aif' => 'audio/aiff',
  86          '.aifc' => 'audio/aiff',
  87          '.aiff' => 'audio/aiff',
  88          '.amc' => 'application/x-mpeg',
  89          '.application' => 'application/x-ms-application',
  90          '.asf' => 'video/x-ms-asf',
  91          '.asx' => 'video/x-ms-asf',
  92          '.au' => 'audio/basic',
  93          '.avi' => 'video/avi',
  94          '.bmp' => 'image/bmp',
  95          '.caf' => 'audio/x-caf',
  96          '.cat' => 'application/vnd.ms-pki.seccat',
  97          '.cbo' => 'application/sha',
  98          '.cdda' => 'audio/aiff',
  99          '.cer' => 'application/x-x509-ca-cert',
 100          '.conf' => 'text/plain',
 101          '.crl' => 'application/pkix-crl',
 102          '.crt' => 'application/x-x509-ca-cert',
 103          '.css' => 'text/css',
 104          '.csv' => 'application/vnd.ms-excel',
 105          '.der' => 'application/x-x509-ca-cert',
 106          '.dib' => 'image/bmp',
 107          '.dif' => 'video/x-dv',
 108          '.dll' => 'application/x-msdownload',
 109          '.doc' => 'application/msword',
 110          '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
 111          '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 112          '.dot' => 'application/msword',
 113          '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
 114          '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
 115          '.dv' => 'video/x-dv',
 116          '.dwfx' => 'model/vnd.dwfx+xps',
 117          '.edn' => 'application/vnd.adobe.edn',
 118          '.eml' => 'message/rfc822',
 119          '.eps' => 'application/postscript',
 120          '.etd' => 'application/x-ebx',
 121          '.exe' => 'application/x-msdownload',
 122          '.fdf' => 'application/vnd.fdf',
 123          '.fif' => 'application/fractals',
 124          '.gif' => 'image/gif',
 125          '.gsm' => 'audio/x-gsm',
 126          '.hqx' => 'application/mac-binhex40',
 127          '.hta' => 'application/hta',
 128          '.htc' => 'text/x-component',
 129          '.htm' => 'text/html',
 130          '.html' => 'text/html',
 131          '.htt' => 'text/webviewhtml',
 132          '.hxa' => 'application/xml',
 133          '.hxc' => 'application/xml',
 134          '.hxd' => 'application/octet-stream',
 135          '.hxe' => 'application/xml',
 136          '.hxf' => 'application/xml',
 137          '.hxh' => 'application/octet-stream',
 138          '.hxi' => 'application/octet-stream',
 139          '.hxk' => 'application/xml',
 140          '.hxq' => 'application/octet-stream',
 141          '.hxr' => 'application/octet-stream',
 142          '.hxs' => 'application/octet-stream',
 143          '.hxt' => 'application/xml',
 144          '.hxv' => 'application/xml',
 145          '.hxw' => 'application/octet-stream',
 146          '.ico' => 'image/x-icon',
 147          '.iii' => 'application/x-iphone',
 148          '.ins' => 'application/x-internet-signup',
 149          '.iqy' => 'text/x-ms-iqy',
 150          '.isp' => 'application/x-internet-signup',
 151          '.jfif' => 'image/jpeg',
 152          '.jnlp' => 'application/x-java-jnlp-file',
 153          '.jpe' => 'image/jpeg',
 154          '.jpeg' => 'image/jpeg',
 155          '.jpg' => 'image/jpeg',
 156          '.jtx' => 'application/x-jtx+xps',
 157          '.latex' => 'application/x-latex',
 158          '.log' => 'text/plain',
 159          '.m1v' => 'video/mpeg',
 160          '.m2v' => 'video/mpeg',
 161          '.m3u' => 'audio/x-mpegurl',
 162          '.mac' => 'image/x-macpaint',
 163          '.man' => 'application/x-troff-man',
 164          '.mda' => 'application/msaccess',
 165          '.mdb' => 'application/msaccess',
 166          '.mde' => 'application/msaccess',
 167          '.mfp' => 'application/x-shockwave-flash',
 168          '.mht' => 'message/rfc822',
 169          '.mhtml' => 'message/rfc822',
 170          '.mid' => 'audio/mid',
 171          '.midi' => 'audio/mid',
 172          '.mod' => 'video/mpeg',
 173          '.mov' => 'video/quicktime',
 174          '.mp2' => 'video/mpeg',
 175          '.mp2v' => 'video/mpeg',
 176          '.mp3' => 'audio/mpeg',
 177          '.mp4' => 'video/mp4',
 178          '.mpa' => 'video/mpeg',
 179          '.mpe' => 'video/mpeg',
 180          '.mpeg' => 'video/mpeg',
 181          '.mpf' => 'application/vnd.ms-mediapackage',
 182          '.mpg' => 'video/mpeg',
 183          '.mpv2' => 'video/mpeg',
 184          '.mqv' => 'video/quicktime',
 185          '.NMW' => 'application/nmwb',
 186          '.nws' => 'message/rfc822',
 187          '.odc' => 'text/x-ms-odc',
 188          '.ols' => 'application/vnd.ms-publisher',
 189          '.p10' => 'application/pkcs10',
 190          '.p12' => 'application/x-pkcs12',
 191          '.p7b' => 'application/x-pkcs7-certificates',
 192          '.p7c' => 'application/pkcs7-mime',
 193          '.p7m' => 'application/pkcs7-mime',
 194          '.p7r' => 'application/x-pkcs7-certreqresp',
 195          '.p7s' => 'application/pkcs7-signature',
 196          '.pct' => 'image/pict',
 197          '.pdf' => 'application/pdf',
 198          '.pdx' => 'application/vnd.adobe.pdx',
 199          '.pfx' => 'application/x-pkcs12',
 200          '.pic' => 'image/pict',
 201          '.pict' => 'image/pict',
 202          '.pinstall' => 'application/x-picasa-detect',
 203          '.pko' => 'application/vnd.ms-pki.pko',
 204          '.png' => 'image/png',
 205          '.pnt' => 'image/x-macpaint',
 206          '.pntg' => 'image/x-macpaint',
 207          '.pot' => 'application/vnd.ms-powerpoint',
 208          '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
 209          '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
 210          '.ppa' => 'application/vnd.ms-powerpoint',
 211          '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
 212          '.pps' => 'application/vnd.ms-powerpoint',
 213          '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
 214          '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
 215          '.ppt' => 'application/vnd.ms-powerpoint',
 216          '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
 217          '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
 218          '.prf' => 'application/pics-rules',
 219          '.ps' => 'application/postscript',
 220          '.pub' => 'application/vnd.ms-publisher',
 221          '.pwz' => 'application/vnd.ms-powerpoint',
 222          '.py' => 'text/plain',
 223          '.pyw' => 'text/plain',
 224          '.qht' => 'text/x-html-insertion',
 225          '.qhtm' => 'text/x-html-insertion',
 226          '.qt' => 'video/quicktime',
 227          '.qti' => 'image/x-quicktime',
 228          '.qtif' => 'image/x-quicktime',
 229          '.qtl' => 'application/x-quicktimeplayer',
 230          '.rat' => 'application/rat-file',
 231          '.rmf' => 'application/vnd.adobe.rmf',
 232          '.rmi' => 'audio/mid',
 233          '.rqy' => 'text/x-ms-rqy',
 234          '.rtf' => 'application/msword',
 235          '.sct' => 'text/scriptlet',
 236          '.sd2' => 'audio/x-sd2',
 237          '.sdp' => 'application/sdp',
 238          '.shtml' => 'text/html',
 239          '.sit' => 'application/x-stuffit',
 240          '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
 241          '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
 242          '.slk' => 'application/vnd.ms-excel',
 243          '.snd' => 'audio/basic',
 244          '.so' => 'application/x-apachemodule',
 245          '.sol' => 'text/plain',
 246          '.sor' => 'text/plain',
 247          '.spc' => 'application/x-pkcs7-certificates',
 248          '.spl' => 'application/futuresplash',
 249          '.sst' => 'application/vnd.ms-pki.certstore',
 250          '.stl' => 'application/vnd.ms-pki.stl',
 251          '.swf' => 'application/x-shockwave-flash',
 252          '.thmx' => 'application/vnd.ms-officetheme',
 253          '.tif' => 'image/tiff',
 254          '.tiff' => 'image/tiff',
 255          '.txt' => 'text/plain',
 256          '.uls' => 'text/iuls',
 257          '.vcf' => 'text/x-vcard',
 258          '.vdx' => 'application/vnd.ms-visio.viewer',
 259          '.vsd' => 'application/vnd.ms-visio.viewer',
 260          '.vss' => 'application/vnd.ms-visio.viewer',
 261          '.vst' => 'application/vnd.ms-visio.viewer',
 262          '.vsx' => 'application/vnd.ms-visio.viewer',
 263          '.vtx' => 'application/vnd.ms-visio.viewer',
 264          '.wav' => 'audio/wav',
 265          '.wax' => 'audio/x-ms-wax',
 266          '.wbk' => 'application/msword',
 267          '.wdp' => 'image/vnd.ms-photo',
 268          '.wiz' => 'application/msword',
 269          '.wm' => 'video/x-ms-wm',
 270          '.wma' => 'audio/x-ms-wma',
 271          '.wmd' => 'application/x-ms-wmd',
 272          '.wmv' => 'video/x-ms-wmv',
 273          '.wmx' => 'video/x-ms-wmx',
 274          '.wmz' => 'application/x-ms-wmz',
 275          '.wpl' => 'application/vnd.ms-wpl',
 276          '.wsc' => 'text/scriptlet',
 277          '.wvx' => 'video/x-ms-wvx',
 278          '.xaml' => 'application/xaml+xml',
 279          '.xbap' => 'application/x-ms-xbap',
 280          '.xdp' => 'application/vnd.adobe.xdp+xml',
 281          '.xfdf' => 'application/vnd.adobe.xfdf',
 282          '.xht' => 'application/xhtml+xml',
 283          '.xhtml' => 'application/xhtml+xml',
 284          '.xla' => 'application/vnd.ms-excel',
 285          '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
 286          '.xlk' => 'application/vnd.ms-excel',
 287          '.xll' => 'application/vnd.ms-excel',
 288          '.xlm' => 'application/vnd.ms-excel',
 289          '.xls' => 'application/vnd.ms-excel',
 290          '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
 291          '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
 292          '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
 293          '.xlt' => 'application/vnd.ms-excel',
 294          '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
 295          '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
 296          '.xlw' => 'application/vnd.ms-excel',
 297          '.xml' => 'text/xml',
 298          '.xps' => 'application/vnd.ms-xpsdocument',
 299          '.xsl' => 'text/xml',
 300      );
 301  
 302      /**
 303       * IE versions which have been analysed to bring you this class, and for
 304       * which some substantive difference exists. These will appear as keys
 305       * in the return value of getRealMimesFromData(). The names are chosen to sort correctly.
 306       */
 307      protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );
 308  
 309      /**
 310       * Type table with versions expanded
 311       */
 312      protected $typeTable = array();
 313  
 314      /** constructor */
 315  	function __construct() {
 316          // Construct versioned type arrays from the base type array plus additions
 317          $types = $this->baseTypeTable;
 318          foreach ( $this->versions as $version ) {
 319              if ( isset( $this->addedTypes[$version] ) ) {
 320                  foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
 321                      $types[$format] = array_merge( $types[$format], $addedTypes );
 322                  }
 323              }
 324              $this->typeTable[$version] = $types;
 325          }
 326      }
 327  
 328      /**
 329       * Get the MIME types from getMimesFromData(), but convert the result from IE's
 330       * idiosyncratic private types into something other apps will understand.
 331       *
 332       * @param string $fileName the file name (unused at present)
 333       * @param string $chunk the first 256 bytes of the file
 334       * @param string $proposed the MIME type proposed by the server
 335       *
 336       * @return Array: map of IE version to detected MIME type
 337       */
 338  	public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
 339          $types = $this->getMimesFromData( $fileName, $chunk, $proposed );
 340          $types = array_map( array( $this, 'translateMimeType' ), $types );
 341          return $types;
 342      }
 343  
 344      /**
 345       * Translate a MIME type from IE's idiosyncratic private types into
 346       * more commonly understood type strings
 347       * @param $type
 348       * @return string
 349       */
 350  	public function translateMimeType( $type ) {
 351          static $table = array(
 352              'image/pjpeg' => 'image/jpeg',
 353              'image/x-png' => 'image/png',
 354              'image/x-wmf' => 'application/x-msmetafile',
 355              'image/bmp' => 'image/x-bmp',
 356              'application/x-zip-compressed' => 'application/zip',
 357              'application/x-compressed' => 'application/x-compress',
 358              'application/x-gzip-compressed' => 'application/x-gzip',
 359              'audio/mid' => 'audio/midi',
 360          );
 361          if ( isset( $table[$type] ) ) {
 362              $type = $table[$type];
 363          }
 364          return $type;
 365      }
 366  
 367      /**
 368       * Get the untranslated MIME types for all known versions
 369       *
 370       * @param string $fileName the file name (unused at present)
 371       * @param string $chunk the first 256 bytes of the file
 372       * @param string $proposed the MIME type proposed by the server
 373       *
 374       * @return Array: map of IE version to detected MIME type
 375       */
 376  	public function getMimesFromData( $fileName, $chunk, $proposed ) {
 377          $types = array();
 378          foreach ( $this->versions as $version ) {
 379              $types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
 380          }
 381          return $types;
 382      }
 383  
 384      /**
 385       * Get the MIME type for a given named version
 386       * @param $version
 387       * @param $fileName
 388       * @param $chunk
 389       * @param $proposed
 390       * @return bool|string
 391       */
 392  	protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
 393          // Strip text after a semicolon
 394          $semiPos = strpos( $proposed, ';' );
 395          if ( $semiPos !== false ) {
 396              $proposed = substr( $proposed, 0, $semiPos );
 397          }
 398  
 399          $proposedFormat = $this->getDataFormat( $version, $proposed );
 400          if ( $proposedFormat == 'unknown'
 401              && $proposed != 'multipart/mixed'
 402              && $proposed != 'multipart/x-mixed-replace' )
 403          {
 404              return $proposed;
 405          }
 406          if ( strval( $chunk ) === '' ) {
 407              return $proposed;
 408          }
 409  
 410          // Truncate chunk at 255 bytes
 411          $chunk = substr( $chunk, 0, 255 );
 412  
 413          // IE does the Check*Headers() calls last, and instead does the following image
 414          // type checks by directly looking for the magic numbers. What I do here should
 415          // have the same effect since the magic number checks are identical in both cases.
 416          $result = $this->sampleData( $version, $chunk );
 417          $sampleFound = $result['found'];
 418          $counters = $result['counters'];
 419          $binaryType = $this->checkBinaryHeaders( $version, $chunk );
 420          $textType = $this->checkTextHeaders( $version, $chunk );
 421  
 422          if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
 423              return 'text/html';
 424          }
 425          if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
 426              return 'image/gif';
 427          }
 428          if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
 429              && $binaryType == 'image/pjpeg' )
 430          {
 431              return $proposed;
 432          }
 433          // PNG check added in IE 7
 434          if ( $version >= 'ie07'
 435              && ( $proposed == 'image/x-png' || $proposed == 'image/png' )
 436              && $binaryType == 'image/x-png' )
 437          {
 438              return $proposed;
 439          }
 440  
 441          // CDF was removed in IE 7 so it won't be in $sampleFound for later versions
 442          if ( isset( $sampleFound['cdf'] ) ) {
 443              return 'application/x-cdf';
 444          }
 445  
 446          // RSS and Atom were added in IE 7 so they won't be in $sampleFound for
 447          // previous versions
 448          if ( isset( $sampleFound['rss'] ) ) {
 449              return 'application/rss+xml';
 450          }
 451          if ( isset( $sampleFound['rdf-tag'] )
 452              && isset( $sampleFound['rdf-url'] )
 453              && isset( $sampleFound['rdf-purl'] ) )
 454          {
 455              return 'application/rss+xml';
 456          }
 457          if ( isset( $sampleFound['atom'] ) ) {
 458              return 'application/atom+xml';
 459          }
 460  
 461          if ( isset( $sampleFound['xml'] ) ) {
 462              // TODO: I'm not sure under what circumstances this flag is enabled
 463              if ( strpos( $version, 'strict' ) !== false ) {
 464                  if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
 465                      return 'text/xml';
 466                  }
 467              } else {
 468                  return 'text/xml';
 469              }
 470          }
 471          if ( isset( $sampleFound['html'] ) ) {
 472              // TODO: I'm not sure under what circumstances this flag is enabled
 473              if ( strpos( $version, 'nohtml' ) !== false ) {
 474                  if ( $proposed == 'text/plain' ) {
 475                      return 'text/html';
 476                  }
 477              } else {
 478                  return 'text/html';
 479              }
 480          }
 481          if ( isset( $sampleFound['xbm'] ) ) {
 482              return 'image/x-bitmap';
 483          }
 484          if ( isset( $sampleFound['binhex'] ) ) {
 485              return 'application/macbinhex40';
 486          }
 487          if ( isset( $sampleFound['scriptlet'] ) ) {
 488              if ( strpos( $version, 'strict' ) !== false ) {
 489                  if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
 490                      return 'text/scriptlet';
 491                  }
 492              } else {
 493                  return 'text/scriptlet';
 494              }
 495          }
 496  
 497          // Freaky heuristics to determine if the data is text or binary
 498          // The heuristic is of course broken for non-ASCII text
 499          if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
 500              < ( $counters['ctrl'] + $counters['high'] ) * 16 )
 501          {
 502              $kindOfBinary = true;
 503              $type = $binaryType ? $binaryType : $textType;
 504              if ( $type === false ) {
 505                  $type = 'application/octet-stream';
 506              }
 507          } else {
 508              $kindOfBinary = false;
 509              $type = $textType ? $textType : $binaryType;
 510              if ( $type === false ) {
 511                  $type = 'text/plain';
 512              }
 513          }
 514  
 515          // Check if the output format is ambiguous
 516          // This generally means that detection failed, real types aren't ambiguous
 517          $detectedFormat = $this->getDataFormat( $version, $type );
 518          if ( $detectedFormat != 'ambiguous' ) {
 519              return $type;
 520          }
 521  
 522          if ( $proposedFormat != 'ambiguous' ) {
 523              // FormatAgreesWithData()
 524              if ( $proposedFormat == 'text' && !$kindOfBinary ) {
 525                  return $proposed;
 526              }
 527              if ( $proposedFormat == 'binary' && $kindOfBinary ) {
 528                  return $proposed;
 529              }
 530              if ( $proposedFormat == 'html' ) {
 531                  return $proposed;
 532              }
 533          }
 534  
 535          // Find a MIME type by searching the registry for the file extension.
 536          $dotPos = strrpos( $fileName, '.' );
 537          if ( $dotPos === false ) {
 538              return $type;
 539          }
 540          $ext = substr( $fileName, $dotPos );
 541          if ( isset( $this->registry[$ext] ) ) {
 542              return $this->registry[$ext];
 543          }
 544  
 545          // TODO: If the extension has an application registered to it, IE will return
 546          // application/octet-stream. We'll skip that, so we could erroneously
 547          // return text/plain or application/x-netcdf where application/octet-stream
 548          // would be correct.
 549  
 550          return $type;
 551      }
 552  
 553      /**
 554       * Check for text headers at the start of the chunk
 555       * Confirmed same in 5 and 7.
 556       * @param $version
 557       * @param $chunk
 558       * @return bool|string
 559       */
 560  	private function checkTextHeaders( $version, $chunk ) {
 561          $chunk2 = substr( $chunk, 0, 2 );
 562          $chunk4 = substr( $chunk, 0, 4 );
 563          $chunk5 = substr( $chunk, 0, 5 );
 564          if ( $chunk4 == '%PDF' ) {
 565              return 'application/pdf';
 566          }
 567          if ( $chunk2 == '%!' ) {
 568              return 'application/postscript';
 569          }
 570          if ( $chunk5 == '{\\rtf' ) {
 571              return 'text/richtext';
 572          }
 573          if ( $chunk5 == 'begin' ) {
 574              return 'application/base64';
 575          }
 576          return false;
 577      }
 578  
 579      /**
 580       * Check for binary headers at the start of the chunk
 581       * Confirmed same in 5 and 7.
 582       * @param $version
 583       * @param $chunk
 584       * @return bool|string
 585       */
 586  	private function checkBinaryHeaders( $version, $chunk ) {
 587          $chunk2 = substr( $chunk, 0, 2 );
 588          $chunk3 = substr( $chunk, 0, 3 );
 589          $chunk4 = substr( $chunk, 0, 4 );
 590          $chunk5 = substr( $chunk, 0, 5 );
 591          $chunk5uc = strtoupper( $chunk5 );
 592          $chunk8 = substr( $chunk, 0, 8 );
 593          if ( $chunk5uc == 'GIF87' || $chunk5uc == 'GIF89' ) {
 594              return 'image/gif';
 595          }
 596          if ( $chunk2 == "\xff\xd8" ) {
 597              return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
 598          }
 599  
 600          if ( $chunk2 == 'BM'
 601              && substr( $chunk, 6, 2 ) == "\000\000"
 602              && substr( $chunk, 8, 2 ) == "\000\000" )
 603          {
 604              return 'image/bmp'; // another non-standard MIME
 605          }
 606          if ( $chunk4 == 'RIFF'
 607              && substr( $chunk, 8, 4 ) == 'WAVE' )
 608          {
 609              return 'audio/wav';
 610          }
 611          // These were integer literals in IE
 612          // Perhaps the author was not sure what the target endianness was
 613          if ( $chunk4 == ".sd\000"
 614              || $chunk4 == ".snd"
 615              || $chunk4 == "\000ds."
 616              || $chunk4 == "dns." )
 617          {
 618              return 'audio/basic';
 619          }
 620          if ( $chunk3 == "MM\000" ) {
 621              return 'image/tiff';
 622          }
 623          if ( $chunk2 == 'MZ' ) {
 624              return 'application/x-msdownload';
 625          }
 626          if ( $chunk8 == "\x89PNG\x0d\x0a\x1a\x0a" ) {
 627              return 'image/x-png'; // [sic]
 628          }
 629          if ( strlen( $chunk ) >= 5 ) {
 630              $byte2 = ord( $chunk[2] );
 631              $byte4 = ord( $chunk[4] );
 632              if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 == 0 && $chunk2 == 'JG' ) {
 633                  return 'image/x-jg';
 634              }
 635          }
 636          // More endian confusion?
 637          if ( $chunk4 == 'MROF' ) {
 638              return 'audio/x-aiff';
 639          }
 640          $chunk4_8 = substr( $chunk, 8, 4 );
 641          if ( $chunk4 == 'FORM' && ( $chunk4_8 == 'AIFF' || $chunk4_8 == 'AIFC' ) ) {
 642              return 'audio/x-aiff';
 643          }
 644          if ( $chunk4 == 'RIFF' && $chunk4_8 == 'AVI ' ) {
 645              return 'video/avi';
 646          }
 647          if ( $chunk4 == "\x00\x00\x01\xb3" || $chunk4 == "\x00\x00\x01\xba" ) {
 648              return 'video/mpeg';
 649          }
 650          if ( $chunk4 == "\001\000\000\000"
 651              && substr( $chunk, 40, 4 ) == ' EMF' )
 652          {
 653              return 'image/x-emf';
 654          }
 655          if ( $chunk4 == "\xd7\xcd\xc6\x9a" ) {
 656              return 'image/x-wmf';
 657          }
 658          if ( $chunk4 == "\xca\xfe\xba\xbe" ) {
 659              return 'application/java';
 660          }
 661          if ( $chunk2 == 'PK' ) {
 662              return 'application/x-zip-compressed';
 663          }
 664          if ( $chunk2 == "\x1f\x9d" ) {
 665              return 'application/x-compressed';
 666          }
 667          if ( $chunk2 == "\x1f\x8b" ) {
 668              return 'application/x-gzip-compressed';
 669          }
 670          // Skip redundant check for ZIP
 671          if ( $chunk5 == "MThd\000" ) {
 672              return 'audio/mid';
 673          }
 674          if ( $chunk4 == '%PDF' ) {
 675              return 'application/pdf';
 676          }
 677          return false;
 678      }
 679  
 680      /**
 681       * Do heuristic checks on the bulk of the data sample.
 682       * Search for HTML tags.
 683       * @param $version
 684       * @param $chunk
 685       * @return array
 686       */
 687  	protected function sampleData( $version, $chunk ) {
 688          $found = array();
 689          $counters = array(
 690              'ctrl' => 0,
 691              'high' => 0,
 692              'low' => 0,
 693              'lf' => 0,
 694              'cr' => 0,
 695              'ff' => 0
 696          );
 697          $htmlTags = array(
 698              'html',
 699              'head',
 700              'title',
 701              'body',
 702              'script',
 703              'a href',
 704              'pre',
 705              'img',
 706              'plaintext',
 707              'table'
 708          );
 709          $rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
 710          $rdfPurl = 'http://purl.org/rss/1.0/';
 711          $xbmMagic1 = '#define';
 712          $xbmMagic2 = '_width';
 713          $xbmMagic3 = '_bits';
 714          $binhexMagic = 'converted with BinHex';
 715          $chunkLength = strlen( $chunk );
 716  
 717          for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
 718              $curChar = $chunk[$offset];
 719              if ( $curChar == "\x0a" ) {
 720                  $counters['lf']++;
 721                  continue;
 722              } elseif ( $curChar == "\x0d" ) {
 723                  $counters['cr']++;
 724                  continue;
 725              } elseif ( $curChar == "\x0c" ) {
 726                  $counters['ff']++;
 727                  continue;
 728              } elseif ( $curChar == "\t" ) {
 729                  $counters['low']++;
 730                  continue;
 731              } elseif ( ord( $curChar ) < 32 ) {
 732                  $counters['ctrl']++;
 733                  continue;
 734              } elseif ( ord( $curChar ) >= 128 ) {
 735                  $counters['high']++;
 736                  continue;
 737              }
 738  
 739              $counters['low']++;
 740              if ( $curChar == '<' ) {
 741                  // XML
 742                  $remainder = substr( $chunk, $offset + 1 );
 743                  if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
 744                      $nextChar = substr( $chunk, $offset + 5, 1 );
 745                      if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
 746                          $found['xml'] = true;
 747                      }
 748                  }
 749                  // Scriptlet (JSP)
 750                  if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
 751                      $found['scriptlet'] = true;
 752                      break;
 753                  }
 754                  // HTML
 755                  foreach ( $htmlTags as $tag ) {
 756                      if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
 757                          $found['html'] = true;
 758                      }
 759                  }
 760                  // Skip broken check for additional tags (HR etc.)
 761  
 762                  // CHANNEL replaced by RSS, RDF and FEED in IE 7
 763                  if ( $version < 'ie07' ) {
 764                      if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
 765                          $found['cdf'] = true;
 766                      }
 767                  } else {
 768                      // RSS
 769                      if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
 770                          $found['rss'] = true;
 771                          break; // return from SampleData
 772                      }
 773                      if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
 774                          $found['rdf-tag'] = true;
 775                          // no break
 776                      }
 777                      if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
 778                          $found['atom'] = true;
 779                          break;
 780                      }
 781                  }
 782                  continue;
 783              }
 784              // Skip broken check for -->
 785  
 786              // RSS URL checks
 787              // For some reason both URLs must appear before it is recognised
 788              $remainder = substr( $chunk, $offset );
 789              if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
 790                  $found['rdf-url'] = true;
 791                  if ( isset( $found['rdf-tag'] )
 792                      && isset( $found['rdf-purl'] ) ) // [sic]
 793                  {
 794                      break;
 795                  }
 796                  continue;
 797              }
 798  
 799              if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
 800                  if ( isset( $found['rdf-tag'] )
 801                      && isset( $found['rdf-url'] ) ) // [sic]
 802                  {
 803                      break;
 804                  }
 805                  continue;
 806              }
 807  
 808              // XBM checks
 809              if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
 810                  $found['xbm1'] = true;
 811                  continue;
 812              }
 813              if ( $curChar == '_' ) {
 814                  if ( isset( $found['xbm2'] ) ) {
 815                      if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
 816                          $found['xbm'] = true;
 817                          break;
 818                      }
 819                  } elseif ( isset( $found['xbm1'] ) ) {
 820                      if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
 821                          $found['xbm2'] = true;
 822                      }
 823                  }
 824              }
 825  
 826              // BinHex
 827              if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
 828                  $found['binhex'] = true;
 829              }
 830          }
 831          return array( 'found' => $found, 'counters' => $counters );
 832      }
 833  
 834      /**
 835       * @param $version
 836       * @param $type
 837       * @return int|string
 838       */
 839  	protected function getDataFormat( $version, $type ) {
 840          $types = $this->typeTable[$version];
 841          if ( $type == '(null)' || strval( $type ) === '' ) {
 842              return 'ambiguous';
 843          }
 844          foreach ( $types as $format => $list ) {
 845              if ( in_array( $type, $list ) ) {
 846                  return $format;
 847              }
 848          }
 849          return 'unknown';
 850      }
 851  }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1