[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
<?php
/**
 * Simulation of Microsoft Internet Explorer's MIME type detection algorithm.
 *
 * @file
 * @todo Define the exact license of this file.
 */

/**
 * This class simulates Microsoft Internet Explorer's terribly broken and
 * insecure MIME type detection algorithm. It can be used to check web uploads
 * with an apparently safe type, to see if IE will reinterpret them to produce
 * something dangerous.
 *
 * It is full of bugs and strange design choices and should not under any
 * circumstances be used to determine a MIME type to present to a user or
 * client. (Apple Safari developers, this means you too.)
 *
 * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have
 * attempted to ensure that this code works in exactly the same way as Internet
 * Explorer, it does not share any source code, or creative choices such as
 * variable names, thus I (Tim Starling) claim copyright on it.
 *
 * It may be redistributed without restriction. To aid reuse, this class does
 * not depend on any MediaWiki module.
 */
class IEContentAnalyzer {
	/**
	 * Relevant data taken from the type table in IE 5.
	 *
	 * Maps a data format name ('ambiguous', 'text', 'binary', 'html') to the
	 * list of MIME types belonging to that format. The numbers in the inline
	 * comments are carried over from the original author's notes.
	 */
	protected $baseTypeTable = array(
		'ambiguous' /*1*/ => array(
			'text/plain',
			'application/octet-stream',
			'application/x-netcdf', // [sic]
		),
		'text' /*3*/ => array(
			'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
			'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
		),
		'binary' /*4*/ => array(
			'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
			'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
			'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
			'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
			'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
			'application/x-msdownload'
		),
		'html' /*5*/ => array( 'text/html' ),
	);

	/**
	 * Changes to the type table in later versions of IE.
	 *
	 * Keyed by version name, then by format name; the constructor merges
	 * these lists into the table of that version and every later one.
	 */
	protected $addedTypes = array(
		'ie07' => array(
			'text' => array( 'text/xml', 'application/xml' )
		),
	);

	/**
	 * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
	 * typical Windows installation.
	 *
	 * Used for extension to MIME type mapping if detection fails.
	 */
	protected $registry = array(
		'.323' => 'text/h323',
		'.3g2' => 'video/3gpp2',
		'.3gp' => 'video/3gpp',
		'.3gp2' => 'video/3gpp2',
		'.3gpp' => 'video/3gpp',
		'.aac' => 'audio/aac',
		'.ac3' => 'audio/ac3',
		'.accda' => 'application/msaccess',
		'.accdb' => 'application/msaccess',
		'.accdc' => 'application/msaccess',
		'.accde' => 'application/msaccess',
		'.accdr' => 'application/msaccess',
		'.accdt' => 'application/msaccess',
		'.ade' => 'application/msaccess',
		'.adp' => 'application/msaccess',
		'.adts' => 'audio/aac',
		'.ai' => 'application/postscript',
		'.aif' => 'audio/aiff',
		'.aifc' => 'audio/aiff',
		'.aiff' => 'audio/aiff',
		'.amc' => 'application/x-mpeg',
		'.application' => 'application/x-ms-application',
		'.asf' => 'video/x-ms-asf',
		'.asx' => 'video/x-ms-asf',
		'.au' => 'audio/basic',
		'.avi' => 'video/avi',
		'.bmp' => 'image/bmp',
		'.caf' => 'audio/x-caf',
		'.cat' => 'application/vnd.ms-pki.seccat',
		'.cbo' => 'application/sha',
		'.cdda' => 'audio/aiff',
		'.cer' => 'application/x-x509-ca-cert',
		'.conf' => 'text/plain',
		'.crl' => 'application/pkix-crl',
		'.crt' => 'application/x-x509-ca-cert',
		'.css' => 'text/css',
		'.csv' => 'application/vnd.ms-excel',
		'.der' => 'application/x-x509-ca-cert',
		'.dib' => 'image/bmp',
		'.dif' => 'video/x-dv',
		'.dll' => 'application/x-msdownload',
		'.doc' => 'application/msword',
		'.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
		'.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
		'.dot' => 'application/msword',
		'.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
		'.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
		'.dv' => 'video/x-dv',
		'.dwfx' => 'model/vnd.dwfx+xps',
		'.edn' => 'application/vnd.adobe.edn',
		'.eml' => 'message/rfc822',
		'.eps' => 'application/postscript',
		'.etd' => 'application/x-ebx',
		'.exe' => 'application/x-msdownload',
		'.fdf' => 'application/vnd.fdf',
		'.fif' => 'application/fractals',
		'.gif' => 'image/gif',
		'.gsm' => 'audio/x-gsm',
		'.hqx' => 'application/mac-binhex40',
		'.hta' => 'application/hta',
		'.htc' => 'text/x-component',
		'.htm' => 'text/html',
		'.html' => 'text/html',
		'.htt' => 'text/webviewhtml',
		'.hxa' => 'application/xml',
		'.hxc' => 'application/xml',
		'.hxd' => 'application/octet-stream',
		'.hxe' => 'application/xml',
		'.hxf' => 'application/xml',
		'.hxh' => 'application/octet-stream',
		'.hxi' => 'application/octet-stream',
		'.hxk' => 'application/xml',
		'.hxq' => 'application/octet-stream',
		'.hxr' => 'application/octet-stream',
		'.hxs' => 'application/octet-stream',
		'.hxt' => 'application/xml',
		'.hxv' => 'application/xml',
		'.hxw' => 'application/octet-stream',
		'.ico' => 'image/x-icon',
		'.iii' => 'application/x-iphone',
		'.ins' => 'application/x-internet-signup',
		'.iqy' => 'text/x-ms-iqy',
		'.isp' => 'application/x-internet-signup',
		'.jfif' => 'image/jpeg',
		'.jnlp' => 'application/x-java-jnlp-file',
		'.jpe' => 'image/jpeg',
		'.jpeg' => 'image/jpeg',
		'.jpg' => 'image/jpeg',
		'.jtx' => 'application/x-jtx+xps',
		'.latex' => 'application/x-latex',
		'.log' => 'text/plain',
		'.m1v' => 'video/mpeg',
		'.m2v' => 'video/mpeg',
		'.m3u' => 'audio/x-mpegurl',
		'.mac' => 'image/x-macpaint',
		'.man' => 'application/x-troff-man',
		'.mda' => 'application/msaccess',
		'.mdb' => 'application/msaccess',
		'.mde' => 'application/msaccess',
		'.mfp' => 'application/x-shockwave-flash',
		'.mht' => 'message/rfc822',
		'.mhtml' => 'message/rfc822',
		'.mid' => 'audio/mid',
		'.midi' => 'audio/mid',
		'.mod' => 'video/mpeg',
		'.mov' => 'video/quicktime',
		'.mp2' => 'video/mpeg',
		'.mp2v' => 'video/mpeg',
		'.mp3' => 'audio/mpeg',
		'.mp4' => 'video/mp4',
		'.mpa' => 'video/mpeg',
		'.mpe' => 'video/mpeg',
		'.mpeg' => 'video/mpeg',
		'.mpf' => 'application/vnd.ms-mediapackage',
		'.mpg' => 'video/mpeg',
		'.mpv2' => 'video/mpeg',
		'.mqv' => 'video/quicktime',
		'.NMW' => 'application/nmwb',
		'.nws' => 'message/rfc822',
		'.odc' => 'text/x-ms-odc',
		'.ols' => 'application/vnd.ms-publisher',
		'.p10' => 'application/pkcs10',
		'.p12' => 'application/x-pkcs12',
		'.p7b' => 'application/x-pkcs7-certificates',
		'.p7c' => 'application/pkcs7-mime',
		'.p7m' => 'application/pkcs7-mime',
		'.p7r' => 'application/x-pkcs7-certreqresp',
		'.p7s' => 'application/pkcs7-signature',
		'.pct' => 'image/pict',
		'.pdf' => 'application/pdf',
		'.pdx' => 'application/vnd.adobe.pdx',
		'.pfx' => 'application/x-pkcs12',
		'.pic' => 'image/pict',
		'.pict' => 'image/pict',
		'.pinstall' => 'application/x-picasa-detect',
		'.pko' => 'application/vnd.ms-pki.pko',
		'.png' => 'image/png',
		'.pnt' => 'image/x-macpaint',
		'.pntg' => 'image/x-macpaint',
		'.pot' => 'application/vnd.ms-powerpoint',
		'.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
		'.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
		'.ppa' => 'application/vnd.ms-powerpoint',
		'.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
		'.pps' => 'application/vnd.ms-powerpoint',
		'.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
		'.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
		'.ppt' => 'application/vnd.ms-powerpoint',
		'.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
		'.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
		'.prf' => 'application/pics-rules',
		'.ps' => 'application/postscript',
		'.pub' => 'application/vnd.ms-publisher',
		'.pwz' => 'application/vnd.ms-powerpoint',
		'.py' => 'text/plain',
		'.pyw' => 'text/plain',
		'.qht' => 'text/x-html-insertion',
		'.qhtm' => 'text/x-html-insertion',
		'.qt' => 'video/quicktime',
		'.qti' => 'image/x-quicktime',
		'.qtif' => 'image/x-quicktime',
		'.qtl' => 'application/x-quicktimeplayer',
		'.rat' => 'application/rat-file',
		'.rmf' => 'application/vnd.adobe.rmf',
		'.rmi' => 'audio/mid',
		'.rqy' => 'text/x-ms-rqy',
		'.rtf' => 'application/msword',
		'.sct' => 'text/scriptlet',
		'.sd2' => 'audio/x-sd2',
		'.sdp' => 'application/sdp',
		'.shtml' => 'text/html',
		'.sit' => 'application/x-stuffit',
		'.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
		'.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
		'.slk' => 'application/vnd.ms-excel',
		'.snd' => 'audio/basic',
		'.so' => 'application/x-apachemodule',
		'.sol' => 'text/plain',
		'.sor' => 'text/plain',
		'.spc' => 'application/x-pkcs7-certificates',
		'.spl' => 'application/futuresplash',
		'.sst' => 'application/vnd.ms-pki.certstore',
		'.stl' => 'application/vnd.ms-pki.stl',
		'.swf' => 'application/x-shockwave-flash',
		'.thmx' => 'application/vnd.ms-officetheme',
		'.tif' => 'image/tiff',
		'.tiff' => 'image/tiff',
		'.txt' => 'text/plain',
		'.uls' => 'text/iuls',
		'.vcf' => 'text/x-vcard',
		'.vdx' => 'application/vnd.ms-visio.viewer',
		'.vsd' => 'application/vnd.ms-visio.viewer',
		'.vss' => 'application/vnd.ms-visio.viewer',
		'.vst' => 'application/vnd.ms-visio.viewer',
		'.vsx' => 'application/vnd.ms-visio.viewer',
		'.vtx' => 'application/vnd.ms-visio.viewer',
		'.wav' => 'audio/wav',
		'.wax' => 'audio/x-ms-wax',
		'.wbk' => 'application/msword',
		'.wdp' => 'image/vnd.ms-photo',
		'.wiz' => 'application/msword',
		'.wm' => 'video/x-ms-wm',
		'.wma' => 'audio/x-ms-wma',
		'.wmd' => 'application/x-ms-wmd',
		'.wmv' => 'video/x-ms-wmv',
		'.wmx' => 'video/x-ms-wmx',
		'.wmz' => 'application/x-ms-wmz',
		'.wpl' => 'application/vnd.ms-wpl',
		'.wsc' => 'text/scriptlet',
		'.wvx' => 'video/x-ms-wvx',
		'.xaml' => 'application/xaml+xml',
		'.xbap' => 'application/x-ms-xbap',
		'.xdp' => 'application/vnd.adobe.xdp+xml',
		'.xfdf' => 'application/vnd.adobe.xfdf',
		'.xht' => 'application/xhtml+xml',
		'.xhtml' => 'application/xhtml+xml',
		'.xla' => 'application/vnd.ms-excel',
		'.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
		'.xlk' => 'application/vnd.ms-excel',
		'.xll' => 'application/vnd.ms-excel',
		'.xlm' => 'application/vnd.ms-excel',
		'.xls' => 'application/vnd.ms-excel',
		'.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
		'.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
		'.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
		'.xlt' => 'application/vnd.ms-excel',
		'.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
		'.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
		'.xlw' => 'application/vnd.ms-excel',
		'.xml' => 'text/xml',
		'.xps' => 'application/vnd.ms-xpsdocument',
		'.xsl' => 'text/xml',
	);

	/**
	 * IE versions which have been analysed to bring you this class, and for
	 * which some substantive difference exists. These will appear as keys
	 * in the return value of getRealMimesFromData(). The names are chosen to sort correctly.
	 *
	 * The code relies on that ordering: versions are compared as plain strings
	 * (e.g. $version >= 'ie07'), and the constructor accumulates type
	 * additions in array order.
	 */
	protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );

	/**
	 * Type table with versions expanded: version name => format name => list
	 * of MIME types. Filled in by the constructor.
	 */
	protected $typeTable = array();

	/**
	 * Build the per-version type tables.
	 *
	 * Starts from $baseTypeTable and walks $versions in order; additions from
	 * $addedTypes are merged into the running table, so they apply to the
	 * version that introduces them and to every version listed after it.
	 */
	public function __construct() {
		// Construct versioned type arrays from the base type array plus additions
		$types = $this->baseTypeTable;
		foreach ( $this->versions as $version ) {
			if ( isset( $this->addedTypes[$version] ) ) {
				foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
					$types[$format] = array_merge( $types[$format], $addedTypes );
				}
			}
			$this->typeTable[$version] = $types;
		}
	}

	/**
	 * Get the MIME types from getMimesFromData(), but convert the result from IE's
	 * idiosyncratic private types into something other apps will understand.
	 *
	 * @param string $fileName the file name; only its extension is consulted,
	 *   as a last resort when sniffing gives an ambiguous result
	 * @param string $chunk the first 256 bytes of the file (only the first 255
	 *   are examined)
	 * @param string $proposed the MIME type proposed by the server
	 *
	 * @return Array: map of IE version to detected MIME type
	 */
	public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
		$types = $this->getMimesFromData( $fileName, $chunk, $proposed );
		$types = array_map( array( $this, 'translateMimeType' ), $types );
		return $types;
	}

	/**
	 * Translate a MIME type from IE's idiosyncratic private types into
	 * more commonly understood type strings
	 *
	 * @param string $type MIME type as reported by the detection code
	 * @return string the mapped type, or $type unchanged if there is no mapping
	 */
	public function translateMimeType( $type ) {
		static $table = array(
			'image/pjpeg' => 'image/jpeg',
			'image/x-png' => 'image/png',
			'image/x-wmf' => 'application/x-msmetafile',
			'image/bmp' => 'image/x-bmp',
			'application/x-zip-compressed' => 'application/zip',
			'application/x-compressed' => 'application/x-compress',
			'application/x-gzip-compressed' => 'application/x-gzip',
			'audio/mid' => 'audio/midi',
		);
		if ( isset( $table[$type] ) ) {
			$type = $table[$type];
		}
		return $type;
	}

	/**
	 * Get the untranslated MIME types for all known versions
	 *
	 * @param string $fileName the file name; only its extension is consulted,
	 *   as a last resort when sniffing gives an ambiguous result
	 * @param string $chunk the first 256 bytes of the file (only the first 255
	 *   are examined)
	 * @param string $proposed the MIME type proposed by the server
	 *
	 * @return Array: map of IE version to detected MIME type
	 */
	public function getMimesFromData( $fileName, $chunk, $proposed ) {
		$types = array();
		foreach ( $this->versions as $version ) {
			$types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
		}
		return $types;
	}

	/**
	 * Get the MIME type for a given named version
	 *
	 * @param string $version one of the names listed in $versions
	 * @param string $fileName file name, used only for the extension lookup
	 *   in $registry at the very end
	 * @param string $chunk leading bytes of the file; truncated to 255 bytes here
	 * @param string $proposed the MIME type proposed by the server; anything
	 *   from the first semicolon onwards is ignored
	 * @return string the MIME type this IE version is expected to settle on
	 */
	protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
		// Strip text after a semicolon
		$semiPos = strpos( $proposed, ';' );
		if ( $semiPos !== false ) {
			$proposed = substr( $proposed, 0, $semiPos );
		}

		// Types missing from the type table are taken at face value, except
		// for the two multipart types which are still sniffed.
		$proposedFormat = $this->getDataFormat( $version, $proposed );
		if ( $proposedFormat == 'unknown'
			&& $proposed != 'multipart/mixed'
			&& $proposed != 'multipart/x-mixed-replace' )
		{
			return $proposed;
		}
		// Nothing to sniff
		if ( strval( $chunk ) === '' ) {
			return $proposed;
		}

		// Truncate chunk at 255 bytes
		$chunk = substr( $chunk, 0, 255 );

		// IE does the Check*Headers() calls last, and instead does the following image
		// type checks by directly looking for the magic numbers. What I do here should
		// have the same effect since the magic number checks are identical in both cases.
		$result = $this->sampleData( $version, $chunk );
		$sampleFound = $result['found'];
		$counters = $result['counters'];
		$binaryType = $this->checkBinaryHeaders( $version, $chunk );
		$textType = $this->checkTextHeaders( $version, $chunk );

		// Cases where the proposed type is confirmed by the data
		if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
			return 'text/html';
		}
		if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
			return 'image/gif';
		}
		if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
			&& $binaryType == 'image/pjpeg' )
		{
			return $proposed;
		}
		// PNG check added in IE 7
		if ( $version >= 'ie07'
			&& ( $proposed == 'image/x-png' || $proposed == 'image/png' )
			&& $binaryType == 'image/x-png' )
		{
			return $proposed;
		}

		// CDF was removed in IE 7 so it won't be in $sampleFound for later versions
		if ( isset( $sampleFound['cdf'] ) ) {
			return 'application/x-cdf';
		}

		// RSS and Atom were added in IE 7 so they won't be in $sampleFound for
		// previous versions
		if ( isset( $sampleFound['rss'] ) ) {
			return 'application/rss+xml';
		}
		// RDF-flavoured RSS needs the root tag and both namespace URLs
		if ( isset( $sampleFound['rdf-tag'] )
			&& isset( $sampleFound['rdf-url'] )
			&& isset( $sampleFound['rdf-purl'] ) )
		{
			return 'application/rss+xml';
		}
		if ( isset( $sampleFound['atom'] ) ) {
			return 'application/atom+xml';
		}

		if ( isset( $sampleFound['xml'] ) ) {
			// TODO: I'm not sure under what circumstances this flag is enabled
			if ( strpos( $version, 'strict' ) !== false ) {
				if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
					return 'text/xml';
				}
			} else {
				return 'text/xml';
			}
		}
		if ( isset( $sampleFound['html'] ) ) {
			// TODO: I'm not sure under what circumstances this flag is enabled
			if ( strpos( $version, 'nohtml' ) !== false ) {
				if ( $proposed == 'text/plain' ) {
					return 'text/html';
				}
			} else {
				return 'text/html';
			}
		}
		if ( isset( $sampleFound['xbm'] ) ) {
			return 'image/x-bitmap';
		}
		if ( isset( $sampleFound['binhex'] ) ) {
			return 'application/macbinhex40';
		}
		if ( isset( $sampleFound['scriptlet'] ) ) {
			if ( strpos( $version, 'strict' ) !== false ) {
				if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
					return 'text/scriptlet';
				}
			} else {
				return 'text/scriptlet';
			}
		}

		// Freaky heuristics to determine if the data is text or binary
		// The heuristic is of course broken for non-ASCII text
		if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
			< ( $counters['ctrl'] + $counters['high'] ) * 16 )
		{
			// Looks binary: prefer the binary magic number result
			$kindOfBinary = true;
			$type = $binaryType ? $binaryType : $textType;
			if ( $type === false ) {
				$type = 'application/octet-stream';
			}
		} else {
			// Looks like text: prefer the text header result
			$kindOfBinary = false;
			$type = $textType ? $textType : $binaryType;
			if ( $type === false ) {
				$type = 'text/plain';
			}
		}

		// Check if the output format is ambiguous
		// This generally means that detection failed, real types aren't ambiguous
		$detectedFormat = $this->getDataFormat( $version, $type );
		if ( $detectedFormat != 'ambiguous' ) {
			return $type;
		}

		if ( $proposedFormat != 'ambiguous' ) {
			// FormatAgreesWithData()
			if ( $proposedFormat == 'text' && !$kindOfBinary ) {
				return $proposed;
			}
			if ( $proposedFormat == 'binary' && $kindOfBinary ) {
				return $proposed;
			}
			if ( $proposedFormat == 'html' ) {
				return $proposed;
			}
		}

		// Find a MIME type by searching the registry for the file extension.
		// NOTE(review): the lookup is case-sensitive, so the '.NMW' entry only
		// matches an upper-case extension and 'x.JPG' matches nothing. Windows
		// registry keys are presumably matched case-insensitively -- confirm
		// before changing.
		$dotPos = strrpos( $fileName, '.' );
		if ( $dotPos === false ) {
			return $type;
		}
		$ext = substr( $fileName, $dotPos );
		if ( isset( $this->registry[$ext] ) ) {
			return $this->registry[$ext];
		}

		// TODO: If the extension has an application registered to it, IE will return
		// application/octet-stream. We'll skip that, so we could erroneously
		// return text/plain or application/x-netcdf where application/octet-stream
		// would be correct.

		return $type;
	}

	/**
	 * Check for text headers at the start of the chunk
	 * Confirmed same in 5 and 7.
	 *
	 * @param string $version IE version name (not used by this check)
	 * @param string $chunk leading bytes of the file
	 * @return bool|string the detected MIME type, or false if no header matched
	 */
	private function checkTextHeaders( $version, $chunk ) {
		$chunk2 = substr( $chunk, 0, 2 );
		$chunk4 = substr( $chunk, 0, 4 );
		$chunk5 = substr( $chunk, 0, 5 );
		if ( $chunk4 == '%PDF' ) {
			return 'application/pdf';
		}
		if ( $chunk2 == '%!' ) {
			return 'application/postscript';
		}
		if ( $chunk5 == '{\\rtf' ) {
			return 'text/richtext';
		}
		if ( $chunk5 == 'begin' ) {
			return 'application/base64';
		}
		return false;
	}

	/**
	 * Check for binary headers at the start of the chunk
	 * Confirmed same in 5 and 7.
	 *
	 * The checks run in a fixed order and the first match wins.
	 *
	 * @param string $version IE version name (not used by this check)
	 * @param string $chunk leading bytes of the file
	 * @return bool|string the detected (IE-private) MIME type, or false if no
	 *   magic number matched
	 */
	private function checkBinaryHeaders( $version, $chunk ) {
		$chunk2 = substr( $chunk, 0, 2 );
		$chunk3 = substr( $chunk, 0, 3 );
		$chunk4 = substr( $chunk, 0, 4 );
		$chunk5 = substr( $chunk, 0, 5 );
		$chunk5uc = strtoupper( $chunk5 );
		$chunk8 = substr( $chunk, 0, 8 );
		if ( $chunk5uc == 'GIF87' || $chunk5uc == 'GIF89' ) {
			return 'image/gif';
		}
		if ( $chunk2 == "\xff\xd8" ) {
			return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
		}

		if ( $chunk2 == 'BM'
			&& substr( $chunk, 6, 2 ) == "\000\000"
			&& substr( $chunk, 8, 2 ) == "\000\000" )
		{
			return 'image/bmp'; // another non-standard MIME
		}
		if ( $chunk4 == 'RIFF'
			&& substr( $chunk, 8, 4 ) == 'WAVE' )
		{
			return 'audio/wav';
		}
		// These were integer literals in IE
		// Perhaps the author was not sure what the target endianness was
		if ( $chunk4 == ".sd\000"
			|| $chunk4 == ".snd"
			|| $chunk4 == "\000ds."
			|| $chunk4 == "dns." )
		{
			return 'audio/basic';
		}
		if ( $chunk3 == "MM\000" ) {
			return 'image/tiff';
		}
		if ( $chunk2 == 'MZ' ) {
			return 'application/x-msdownload';
		}
		if ( $chunk8 == "\x89PNG\x0d\x0a\x1a\x0a" ) {
			return 'image/x-png'; // [sic]
		}
		if ( strlen( $chunk ) >= 5 ) {
			$byte2 = ord( $chunk[2] );
			$byte4 = ord( $chunk[4] );
			if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 == 0 && $chunk2 == 'JG' ) {
				return 'image/x-jg';
			}
		}
		// More endian confusion?
		if ( $chunk4 == 'MROF' ) {
			return 'audio/x-aiff';
		}
		$chunk4_8 = substr( $chunk, 8, 4 );
		if ( $chunk4 == 'FORM' && ( $chunk4_8 == 'AIFF' || $chunk4_8 == 'AIFC' ) ) {
			return 'audio/x-aiff';
		}
		if ( $chunk4 == 'RIFF' && $chunk4_8 == 'AVI ' ) {
			return 'video/avi';
		}
		if ( $chunk4 == "\x00\x00\x01\xb3" || $chunk4 == "\x00\x00\x01\xba" ) {
			return 'video/mpeg';
		}
		if ( $chunk4 == "\001\000\000\000"
			&& substr( $chunk, 40, 4 ) == ' EMF' )
		{
			return 'image/x-emf';
		}
		if ( $chunk4 == "\xd7\xcd\xc6\x9a" ) {
			return 'image/x-wmf';
		}
		if ( $chunk4 == "\xca\xfe\xba\xbe" ) {
			return 'application/java';
		}
		if ( $chunk2 == 'PK' ) {
			return 'application/x-zip-compressed';
		}
		if ( $chunk2 == "\x1f\x9d" ) {
			return 'application/x-compressed';
		}
		if ( $chunk2 == "\x1f\x8b" ) {
			return 'application/x-gzip-compressed';
		}
		// Skip redundant check for ZIP
		if ( $chunk5 == "MThd\000" ) {
			return 'audio/mid';
		}
		if ( $chunk4 == '%PDF' ) {
			return 'application/pdf';
		}
		return false;
	}

	/**
	 * Do heuristic checks on the bulk of the data sample.
	 * Search for HTML tags.
	 *
	 * Walks the chunk one byte at a time, counting character classes and
	 * setting a flag for each recognised marker. Some markers end the scan
	 * early (break), so later bytes are then neither counted nor inspected.
	 *
	 * @param string $version IE version name; selects between the CHANNEL
	 *   check (before 'ie07') and the RSS/RDF/FEED checks
	 * @param string $chunk the bytes to scan
	 * @return array with two keys:
	 *   - 'found': map of marker name => true for each marker seen
	 *   - 'counters': map of 'ctrl', 'high', 'low', 'lf', 'cr', 'ff' => count
	 */
	protected function sampleData( $version, $chunk ) {
		$found = array();
		$counters = array(
			'ctrl' => 0,
			'high' => 0,
			'low' => 0,
			'lf' => 0,
			'cr' => 0,
			'ff' => 0
		);
		$htmlTags = array(
			'html',
			'head',
			'title',
			'body',
			'script',
			'a href',
			'pre',
			'img',
			'plaintext',
			'table'
		);
		$rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
		$rdfPurl = 'http://purl.org/rss/1.0/';
		$xbmMagic1 = '#define';
		$xbmMagic2 = '_width';
		$xbmMagic3 = '_bits';
		$binhexMagic = 'converted with BinHex';
		$chunkLength = strlen( $chunk );

		for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
			$curChar = $chunk[$offset];
			// Classify the byte; anything except printable ASCII is only counted
			if ( $curChar == "\x0a" ) {
				$counters['lf']++;
				continue;
			} elseif ( $curChar == "\x0d" ) {
				$counters['cr']++;
				continue;
			} elseif ( $curChar == "\x0c" ) {
				$counters['ff']++;
				continue;
			} elseif ( $curChar == "\t" ) {
				$counters['low']++;
				continue;
			} elseif ( ord( $curChar ) < 32 ) {
				$counters['ctrl']++;
				continue;
			} elseif ( ord( $curChar ) >= 128 ) {
				$counters['high']++;
				continue;
			}

			$counters['low']++;
			if ( $curChar == '<' ) {
				// XML
				$remainder = substr( $chunk, $offset + 1 );
				if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
					$nextChar = substr( $chunk, $offset + 5, 1 );
					if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
						$found['xml'] = true;
					}
				}
				// Scriptlet (JSP)
				if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
					$found['scriptlet'] = true;
					break;
				}
				// HTML
				foreach ( $htmlTags as $tag ) {
					if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
						$found['html'] = true;
					}
				}
				// Skip broken check for additional tags (HR etc.)

				// CHANNEL replaced by RSS, RDF and FEED in IE 7
				if ( $version < 'ie07' ) {
					if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
						$found['cdf'] = true;
					}
				} else {
					// RSS
					if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
						$found['rss'] = true;
						break; // return from SampleData
					}
					if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
						$found['rdf-tag'] = true;
						// no break
					}
					if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
						$found['atom'] = true;
						break;
					}
				}
				continue;
			}
			// Skip broken check for -->

			// RSS URL checks
			// For some reason both URLs must appear before it is recognised
			$remainder = substr( $chunk, $offset );
			if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
				$found['rdf-url'] = true;
				if ( isset( $found['rdf-tag'] )
					&& isset( $found['rdf-purl'] ) ) // [sic]
				{
					break;
				}
				continue;
			}

			if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
				// Record the match. This flag is what getMimeTypeForVersion()
				// and the rdf-url branch above test for; it was previously
				// never set, which made RDF-based RSS detection unreachable.
				$found['rdf-purl'] = true;
				if ( isset( $found['rdf-tag'] )
					&& isset( $found['rdf-url'] ) ) // [sic]
				{
					break;
				}
				continue;
			}

			// XBM checks
			// The three magic strings must be seen in order: '#define', then
			// '_width', then '_bits'.
			if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
				$found['xbm1'] = true;
				continue;
			}
			if ( $curChar == '_' ) {
				if ( isset( $found['xbm2'] ) ) {
					if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
						$found['xbm'] = true;
						break;
					}
				} elseif ( isset( $found['xbm1'] ) ) {
					if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
						$found['xbm2'] = true;
					}
				}
			}

			// BinHex
			if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
				$found['binhex'] = true;
			}
		}
		return array( 'found' => $found, 'counters' => $counters );
	}

	/**
	 * Look up the data format that a MIME type belongs to in the given
	 * version's type table.
	 *
	 * @param string $version one of the names listed in $versions
	 * @param string $type the MIME type to classify
	 * @return string the format key from the type table ('ambiguous', 'text',
	 *   'binary' or 'html'); 'ambiguous' for an empty type or the literal
	 *   '(null)'; 'unknown' if the type is not in the table
	 */
	protected function getDataFormat( $version, $type ) {
		$types = $this->typeTable[$version];
		if ( $type == '(null)' || strval( $type ) === '' ) {
			return 'ambiguous';
		}
		foreach ( $types as $format => $list ) {
			if ( in_array( $type, $list ) ) {
				return $format;
			}
		}
		return 'unknown';
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |