MediaWiki
REL1_19
|
<?php

/**
 * Emulates the MIME type detection ("content sniffing") that Internet
 * Explorer applies to the first bytes of a file, for several IE versions.
 *
 * Given a file name, a leading chunk of the file's data and the MIME type
 * proposed for it, the analyzer reports the type each emulated IE version
 * would settle on.
 */
class IEContentAnalyzer {
	/**
	 * Data formats and the MIME types belonging to each, shared by all versions.
	 *
	 * The numbers in the inline comments are carried over from the original
	 * source; they label the formats.
	 */
	protected $baseTypeTable = array(
		'ambiguous' /*1*/ => array(
			'text/plain',
			'application/octet-stream',
			'application/x-netcdf', // [sic]
		),
		'text' /*3*/ => array(
			'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
			'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
		),
		'binary' /*4*/ => array(
			'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
			'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
			'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
			'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
			'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
			'application/x-msdownload'
		),
		'html' /*5*/ => array( 'text/html' ),
	);

	/**
	 * MIME types added to a format from a given version onwards.
	 * Shape: version => format => list of MIME types.
	 */
	protected $addedTypes = array(
		'ie07' => array(
			'text' => array( 'text/xml', 'application/xml' )
		),
	);

	/**
	 * File extension (including the leading dot) => MIME type.
	 *
	 * Consulted as a last resort when content detection yields an ambiguous
	 * type. Lookups are case-insensitive; see $registryLower.
	 */
	protected $registry = array(
		'.323' => 'text/h323',
		'.3g2' => 'video/3gpp2',
		'.3gp' => 'video/3gpp',
		'.3gp2' => 'video/3gpp2',
		'.3gpp' => 'video/3gpp',
		'.aac' => 'audio/aac',
		'.ac3' => 'audio/ac3',
		'.accda' => 'application/msaccess',
		'.accdb' => 'application/msaccess',
		'.accdc' => 'application/msaccess',
		'.accde' => 'application/msaccess',
		'.accdr' => 'application/msaccess',
		'.accdt' => 'application/msaccess',
		'.ade' => 'application/msaccess',
		'.adp' => 'application/msaccess',
		'.adts' => 'audio/aac',
		'.ai' => 'application/postscript',
		'.aif' => 'audio/aiff',
		'.aifc' => 'audio/aiff',
		'.aiff' => 'audio/aiff',
		'.amc' => 'application/x-mpeg',
		'.application' => 'application/x-ms-application',
		'.asf' => 'video/x-ms-asf',
		'.asx' => 'video/x-ms-asf',
		'.au' => 'audio/basic',
		'.avi' => 'video/avi',
		'.bmp' => 'image/bmp',
		'.caf' => 'audio/x-caf',
		'.cat' => 'application/vnd.ms-pki.seccat',
		'.cbo' => 'application/sha',
		'.cdda' => 'audio/aiff',
		'.cer' => 'application/x-x509-ca-cert',
		'.conf' => 'text/plain',
		'.crl' => 'application/pkix-crl',
		'.crt' => 'application/x-x509-ca-cert',
		'.css' => 'text/css',
		'.csv' => 'application/vnd.ms-excel',
		'.der' => 'application/x-x509-ca-cert',
		'.dib' => 'image/bmp',
		'.dif' => 'video/x-dv',
		'.dll' => 'application/x-msdownload',
		'.doc' => 'application/msword',
		'.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
		'.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
		'.dot' => 'application/msword',
		'.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
		'.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
		'.dv' => 'video/x-dv',
		'.dwfx' => 'model/vnd.dwfx+xps',
		'.edn' => 'application/vnd.adobe.edn',
		'.eml' => 'message/rfc822',
		'.eps' => 'application/postscript',
		'.etd' => 'application/x-ebx',
		'.exe' => 'application/x-msdownload',
		'.fdf' => 'application/vnd.fdf',
		'.fif' => 'application/fractals',
		'.gif' => 'image/gif',
		'.gsm' => 'audio/x-gsm',
		'.hqx' => 'application/mac-binhex40',
		'.hta' => 'application/hta',
		'.htc' => 'text/x-component',
		'.htm' => 'text/html',
		'.html' => 'text/html',
		'.htt' => 'text/webviewhtml',
		'.hxa' => 'application/xml',
		'.hxc' => 'application/xml',
		'.hxd' => 'application/octet-stream',
		'.hxe' => 'application/xml',
		'.hxf' => 'application/xml',
		'.hxh' => 'application/octet-stream',
		'.hxi' => 'application/octet-stream',
		'.hxk' => 'application/xml',
		'.hxq' => 'application/octet-stream',
		'.hxr' => 'application/octet-stream',
		'.hxs' => 'application/octet-stream',
		'.hxt' => 'application/xml',
		'.hxv' => 'application/xml',
		'.hxw' => 'application/octet-stream',
		'.ico' => 'image/x-icon',
		'.iii' => 'application/x-iphone',
		'.ins' => 'application/x-internet-signup',
		'.iqy' => 'text/x-ms-iqy',
		'.isp' => 'application/x-internet-signup',
		'.jfif' => 'image/jpeg',
		'.jnlp' => 'application/x-java-jnlp-file',
		'.jpe' => 'image/jpeg',
		'.jpeg' => 'image/jpeg',
		'.jpg' => 'image/jpeg',
		'.jtx' => 'application/x-jtx+xps',
		'.latex' => 'application/x-latex',
		'.log' => 'text/plain',
		'.m1v' => 'video/mpeg',
		'.m2v' => 'video/mpeg',
		'.m3u' => 'audio/x-mpegurl',
		'.mac' => 'image/x-macpaint',
		'.man' => 'application/x-troff-man',
		'.mda' => 'application/msaccess',
		'.mdb' => 'application/msaccess',
		'.mde' => 'application/msaccess',
		'.mfp' => 'application/x-shockwave-flash',
		'.mht' => 'message/rfc822',
		'.mhtml' => 'message/rfc822',
		'.mid' => 'audio/mid',
		'.midi' => 'audio/mid',
		'.mod' => 'video/mpeg',
		'.mov' => 'video/quicktime',
		'.mp2' => 'video/mpeg',
		'.mp2v' => 'video/mpeg',
		'.mp3' => 'audio/mpeg',
		'.mp4' => 'video/mp4',
		'.mpa' => 'video/mpeg',
		'.mpe' => 'video/mpeg',
		'.mpeg' => 'video/mpeg',
		'.mpf' => 'application/vnd.ms-mediapackage',
		'.mpg' => 'video/mpeg',
		'.mpv2' => 'video/mpeg',
		'.mqv' => 'video/quicktime',
		'.NMW' => 'application/nmwb',
		'.nws' => 'message/rfc822',
		'.odc' => 'text/x-ms-odc',
		'.ols' => 'application/vnd.ms-publisher',
		'.p10' => 'application/pkcs10',
		'.p12' => 'application/x-pkcs12',
		'.p7b' => 'application/x-pkcs7-certificates',
		'.p7c' => 'application/pkcs7-mime',
		'.p7m' => 'application/pkcs7-mime',
		'.p7r' => 'application/x-pkcs7-certreqresp',
		'.p7s' => 'application/pkcs7-signature',
		'.pct' => 'image/pict',
		'.pdf' => 'application/pdf',
		'.pdx' => 'application/vnd.adobe.pdx',
		'.pfx' => 'application/x-pkcs12',
		'.pic' => 'image/pict',
		'.pict' => 'image/pict',
		'.pinstall' => 'application/x-picasa-detect',
		'.pko' => 'application/vnd.ms-pki.pko',
		'.png' => 'image/png',
		'.pnt' => 'image/x-macpaint',
		'.pntg' => 'image/x-macpaint',
		'.pot' => 'application/vnd.ms-powerpoint',
		'.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
		'.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
		'.ppa' => 'application/vnd.ms-powerpoint',
		'.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
		'.pps' => 'application/vnd.ms-powerpoint',
		'.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
		'.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
		'.ppt' => 'application/vnd.ms-powerpoint',
		'.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
		'.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
		'.prf' => 'application/pics-rules',
		'.ps' => 'application/postscript',
		'.pub' => 'application/vnd.ms-publisher',
		'.pwz' => 'application/vnd.ms-powerpoint',
		'.py' => 'text/plain',
		'.pyw' => 'text/plain',
		'.qht' => 'text/x-html-insertion',
		'.qhtm' => 'text/x-html-insertion',
		'.qt' => 'video/quicktime',
		'.qti' => 'image/x-quicktime',
		'.qtif' => 'image/x-quicktime',
		'.qtl' => 'application/x-quicktimeplayer',
		'.rat' => 'application/rat-file',
		'.rmf' => 'application/vnd.adobe.rmf',
		'.rmi' => 'audio/mid',
		'.rqy' => 'text/x-ms-rqy',
		'.rtf' => 'application/msword',
		'.sct' => 'text/scriptlet',
		'.sd2' => 'audio/x-sd2',
		'.sdp' => 'application/sdp',
		'.shtml' => 'text/html',
		'.sit' => 'application/x-stuffit',
		'.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
		'.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
		'.slk' => 'application/vnd.ms-excel',
		'.snd' => 'audio/basic',
		'.so' => 'application/x-apachemodule',
		'.sol' => 'text/plain',
		'.sor' => 'text/plain',
		'.spc' => 'application/x-pkcs7-certificates',
		'.spl' => 'application/futuresplash',
		'.sst' => 'application/vnd.ms-pki.certstore',
		'.stl' => 'application/vnd.ms-pki.stl',
		'.swf' => 'application/x-shockwave-flash',
		'.thmx' => 'application/vnd.ms-officetheme',
		'.tif' => 'image/tiff',
		'.tiff' => 'image/tiff',
		'.txt' => 'text/plain',
		'.uls' => 'text/iuls',
		'.vcf' => 'text/x-vcard',
		'.vdx' => 'application/vnd.ms-visio.viewer',
		'.vsd' => 'application/vnd.ms-visio.viewer',
		'.vss' => 'application/vnd.ms-visio.viewer',
		'.vst' => 'application/vnd.ms-visio.viewer',
		'.vsx' => 'application/vnd.ms-visio.viewer',
		'.vtx' => 'application/vnd.ms-visio.viewer',
		'.wav' => 'audio/wav',
		'.wax' => 'audio/x-ms-wax',
		'.wbk' => 'application/msword',
		'.wdp' => 'image/vnd.ms-photo',
		'.wiz' => 'application/msword',
		'.wm' => 'video/x-ms-wm',
		'.wma' => 'audio/x-ms-wma',
		'.wmd' => 'application/x-ms-wmd',
		'.wmv' => 'video/x-ms-wmv',
		'.wmx' => 'video/x-ms-wmx',
		'.wmz' => 'application/x-ms-wmz',
		'.wpl' => 'application/vnd.ms-wpl',
		'.wsc' => 'text/scriptlet',
		'.wvx' => 'video/x-ms-wvx',
		'.xaml' => 'application/xaml+xml',
		'.xbap' => 'application/x-ms-xbap',
		'.xdp' => 'application/vnd.adobe.xdp+xml',
		'.xfdf' => 'application/vnd.adobe.xfdf',
		'.xht' => 'application/xhtml+xml',
		'.xhtml' => 'application/xhtml+xml',
		'.xla' => 'application/vnd.ms-excel',
		'.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
		'.xlk' => 'application/vnd.ms-excel',
		'.xll' => 'application/vnd.ms-excel',
		'.xlm' => 'application/vnd.ms-excel',
		'.xls' => 'application/vnd.ms-excel',
		'.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
		'.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
		'.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
		'.xlt' => 'application/vnd.ms-excel',
		'.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
		'.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
		'.xlw' => 'application/vnd.ms-excel',
		'.xml' => 'text/xml',
		'.xps' => 'application/vnd.ms-xpsdocument',
		'.xsl' => 'text/xml',
	);

	/**
	 * Emulated versions, in ascending order.
	 *
	 * The order matters in two ways: type additions accumulate from one
	 * version to the next in the constructor, and version gates elsewhere in
	 * the class use plain string comparison against 'ie07'. The '.strict' and
	 * '.nohtml' suffixes select variant code paths in getMimeTypeForVersion().
	 */
	protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );

	/**
	 * Per-version type table (version => format => list of MIME types),
	 * populated by the constructor.
	 */
	protected $typeTable = array();

	/**
	 * Copy of $registry with lower-cased keys, populated by the constructor
	 * and used for case-insensitive extension lookups.
	 */
	protected $registryLower = array();

	/**
	 * Build the per-version type tables and the case-insensitive extension map.
	 */
	public function __construct() {
		// Construct versioned type arrays from the base type array plus additions.
		// $types is deliberately carried across iterations so that types added
		// for one version remain present for every later version.
		$types = $this->baseTypeTable;
		foreach ( $this->versions as $version ) {
			if ( isset( $this->addedTypes[$version] ) ) {
				foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
					$types[$format] = array_merge( $types[$format], $addedTypes );
				}
			}
			$this->typeTable[$version] = $types;
		}

		// Normalise the extension keys (one of them, '.NMW', is upper case) so
		// that lookups do not depend on the case used in the file name.
		$this->registryLower = array_change_key_case( $this->registry, CASE_LOWER );
	}

	/**
	 * Get the detected MIME types for all known versions, with IE-specific
	 * type names translated by translateMimeType().
	 *
	 * @param $fileName String: the file name (unused at present)
	 * @param $chunk String: the first 256 bytes of the file
	 * @param $proposed String: the MIME type proposed by the server
	 * @return Array: map of version to detected (translated) MIME type
	 */
	public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
		$types = $this->getMimesFromData( $fileName, $chunk, $proposed );
		$types = array_map( array( $this, 'translateMimeType' ), $types );
		return $types;
	}

	/**
	 * Translate a type name that this class emits into its more conventional
	 * spelling; types without an entry in the table are returned unchanged.
	 *
	 * @param $type String: MIME type as produced by the detection methods
	 * @return String
	 */
	public function translateMimeType( $type ) {
		static $table = array(
			'image/pjpeg' => 'image/jpeg',
			'image/x-png' => 'image/png',
			'image/x-wmf' => 'application/x-msmetafile',
			'image/bmp' => 'image/x-bmp',
			'application/x-zip-compressed' => 'application/zip',
			'application/x-compressed' => 'application/x-compress',
			'application/x-gzip-compressed' => 'application/x-gzip',
			'audio/mid' => 'audio/midi',
		);
		if ( isset( $table[$type] ) ) {
			$type = $table[$type];
		}
		return $type;
	}

	/**
	 * Get the untranslated detected MIME types for all known versions.
	 *
	 * @param $fileName String: the file name
	 * @param $chunk String: the leading bytes of the file
	 * @param $proposed String: the MIME type proposed by the server
	 * @return Array: map of version to detected MIME type
	 */
	public function getMimesFromData( $fileName, $chunk, $proposed ) {
		$types = array();
		foreach ( $this->versions as $version ) {
			$types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
		}
		return $types;
	}

	/**
	 * Get the MIME type that a given version would detect.
	 *
	 * @param $version String: one of the entries of $this->versions
	 * @param $fileName String: the file name; only its extension is used
	 * @param $chunk String: the leading bytes of the file (truncated to 255 here)
	 * @param $proposed String: the MIME type proposed by the server
	 * @return String: the detected MIME type
	 */
	protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
		// Strip text after a semicolon
		$semiPos = strpos( $proposed, ';' );
		if ( $semiPos !== false ) {
			$proposed = substr( $proposed, 0, $semiPos );
		}

		// A proposed type that is not in the tables is trusted as-is, apart
		// from the two multipart types, which still go through detection.
		$proposedFormat = $this->getDataFormat( $version, $proposed );
		if ( $proposedFormat === 'unknown'
			&& $proposed != 'multipart/mixed'
			&& $proposed != 'multipart/x-mixed-replace' )
		{
			return $proposed;
		}
		// Nothing to sniff
		if ( strval( $chunk ) === '' ) {
			return $proposed;
		}

		// Truncate chunk at 255 bytes
		$chunk = substr( $chunk, 0, 255 );

		// IE does the Check*Headers() calls last, and instead does the following image
		// type checks by directly looking for the magic numbers. What I do here should
		// have the same effect since the magic number checks are identical in both cases.
		$result = $this->sampleData( $version, $chunk );
		$sampleFound = $result['found'];
		$counters = $result['counters'];
		$binaryType = $this->checkBinaryHeaders( $version, $chunk );
		$textType = $this->checkTextHeaders( $version, $chunk );

		// Cases where the proposed type is confirmed by the data
		if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
			return 'text/html';
		}
		if ( $proposed == 'image/gif' && $binaryType === 'image/gif' ) {
			return 'image/gif';
		}
		if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
			&& $binaryType === 'image/pjpeg' )
		{
			return $proposed;
		}
		// PNG check added in IE 7
		if ( $version >= 'ie07'
			&& ( $proposed == 'image/x-png' || $proposed == 'image/png' )
			&& $binaryType === 'image/x-png' )
		{
			return $proposed;
		}

		// CDF was removed in IE 7 so it won't be in $sampleFound for later versions
		if ( isset( $sampleFound['cdf'] ) ) {
			return 'application/x-cdf';
		}

		// RSS and Atom were added in IE 7 so they won't be in $sampleFound for
		// previous versions
		if ( isset( $sampleFound['rss'] ) ) {
			return 'application/rss+xml';
		}
		if ( isset( $sampleFound['rdf-tag'] )
			&& isset( $sampleFound['rdf-url'] )
			&& isset( $sampleFound['rdf-purl'] ) )
		{
			return 'application/rss+xml';
		}
		if ( isset( $sampleFound['atom'] ) ) {
			return 'application/atom+xml';
		}

		if ( isset( $sampleFound['xml'] ) ) {
			// TODO: I'm not sure under what circumstances this flag is enabled
			if ( strpos( $version, 'strict' ) !== false ) {
				if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
					return 'text/xml';
				}
			} else {
				return 'text/xml';
			}
		}
		if ( isset( $sampleFound['html'] ) ) {
			// TODO: I'm not sure under what circumstances this flag is enabled
			if ( strpos( $version, 'nohtml' ) !== false ) {
				if ( $proposed == 'text/plain' ) {
					return 'text/html';
				}
			} else {
				return 'text/html';
			}
		}
		if ( isset( $sampleFound['xbm'] ) ) {
			return 'image/x-bitmap';
		}
		if ( isset( $sampleFound['binhex'] ) ) {
			return 'application/macbinhex40';
		}
		if ( isset( $sampleFound['scriptlet'] ) ) {
			if ( strpos( $version, 'strict' ) !== false ) {
				if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
					return 'text/scriptlet';
				}
			} else {
				return 'text/scriptlet';
			}
		}

		// Freaky heuristics to determine if the data is text or binary
		// The heuristic is of course broken for non-ASCII text
		if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
			< ( $counters['ctrl'] + $counters['high'] ) * 16 )
		{
			$kindOfBinary = true;
			$type = $binaryType ? $binaryType : $textType;
			if ( $type === false ) {
				$type = 'application/octet-stream';
			}
		} else {
			$kindOfBinary = false;
			$type = $textType ? $textType : $binaryType;
			if ( $type === false ) {
				$type = 'text/plain';
			}
		}

		// Check if the output format is ambiguous
		// This generally means that detection failed, real types aren't ambiguous
		$detectedFormat = $this->getDataFormat( $version, $type );
		if ( $detectedFormat !== 'ambiguous' ) {
			return $type;
		}

		if ( $proposedFormat !== 'ambiguous' ) {
			// FormatAgreesWithData()
			if ( $proposedFormat === 'text' && !$kindOfBinary ) {
				return $proposed;
			}
			if ( $proposedFormat === 'binary' && $kindOfBinary ) {
				return $proposed;
			}
			if ( $proposedFormat === 'html' ) {
				return $proposed;
			}
		}

		// Find a MIME type by searching the registry for the file extension.
		// The extension is lower-cased first so that e.g. "FOO.HTML" and
		// "foo.html" resolve to the same entry.
		$dotPos = strrpos( $fileName, '.' );
		if ( $dotPos === false ) {
			return $type;
		}
		$ext = strtolower( substr( $fileName, $dotPos ) );
		if ( isset( $this->registryLower[$ext] ) ) {
			return $this->registryLower[$ext];
		}

		// TODO: If the extension has an application registered to it, IE will return
		// application/octet-stream. We'll skip that, so we could erroneously
		// return text/plain or application/x-netcdf where application/octet-stream
		// would be correct.

		return $type;
	}

	/**
	 * Check for text type magic at the start of the chunk.
	 *
	 * @param $version String: unused
	 * @param $chunk String: the leading bytes of the file
	 * @return String|false: the detected MIME type, or false if nothing matched
	 */
	private function checkTextHeaders( $version, $chunk ) {
		$chunk2 = substr( $chunk, 0, 2 );
		$chunk4 = substr( $chunk, 0, 4 );
		$chunk5 = substr( $chunk, 0, 5 );
		if ( $chunk4 === '%PDF' ) {
			return 'application/pdf';
		}
		if ( $chunk2 === '%!' ) {
			return 'application/postscript';
		}
		if ( $chunk5 === '{\\rtf' ) {
			return 'text/richtext';
		}
		if ( $chunk5 === 'begin' ) {
			return 'application/base64';
		}
		return false;
	}

	/**
	 * Check for binary type magic at the start of the chunk.
	 *
	 * The order of the checks is significant: the first match wins.
	 *
	 * @param $version String: unused
	 * @param $chunk String: the leading bytes of the file
	 * @return String|false: the detected MIME type, or false if nothing matched
	 */
	private function checkBinaryHeaders( $version, $chunk ) {
		$chunk2 = substr( $chunk, 0, 2 );
		$chunk3 = substr( $chunk, 0, 3 );
		$chunk4 = substr( $chunk, 0, 4 );
		$chunk5 = substr( $chunk, 0, 5 );
		$chunk5uc = strtoupper( $chunk5 );
		$chunk8 = substr( $chunk, 0, 8 );
		if ( $chunk5uc === 'GIF87' || $chunk5uc === 'GIF89' ) {
			return 'image/gif';
		}
		if ( $chunk2 === "\xff\xd8" ) {
			return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
		}

		if ( $chunk2 === 'BM'
			&& substr( $chunk, 6, 2 ) === "\000\000"
			&& substr( $chunk, 8, 2 ) === "\000\000" )
		{
			return 'image/bmp'; // another non-standard MIME
		}
		if ( $chunk4 === 'RIFF'
			&& substr( $chunk, 8, 4 ) === 'WAVE' )
		{
			return 'audio/wav';
		}
		// These were integer literals in IE
		// Perhaps the author was not sure what the target endianness was
		if ( $chunk4 === ".sd\000"
			|| $chunk4 === ".snd"
			|| $chunk4 === "\000ds."
			|| $chunk4 === "dns." )
		{
			return 'audio/basic';
		}
		if ( $chunk3 === "MM\000" ) {
			return 'image/tiff';
		}
		if ( $chunk2 === 'MZ' ) {
			return 'application/x-msdownload';
		}
		if ( $chunk8 === "\x89PNG\x0d\x0a\x1a\x0a" ) {
			return 'image/x-png'; // [sic]
		}
		if ( strlen( $chunk ) >= 5 ) {
			$byte2 = ord( $chunk[2] );
			$byte4 = ord( $chunk[4] );
			if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 == 0 && $chunk2 === 'JG' ) {
				return 'image/x-jg';
			}
		}
		// More endian confusion?
		if ( $chunk4 === 'MROF' ) {
			return 'audio/x-aiff';
		}
		$chunk4_8 = substr( $chunk, 8, 4 );
		if ( $chunk4 === 'FORM' && ( $chunk4_8 === 'AIFF' || $chunk4_8 === 'AIFC' ) ) {
			return 'audio/x-aiff';
		}
		if ( $chunk4 === 'RIFF' && $chunk4_8 === 'AVI ' ) {
			return 'video/avi';
		}
		if ( $chunk4 === "\x00\x00\x01\xb3" || $chunk4 === "\x00\x00\x01\xba" ) {
			return 'video/mpeg';
		}
		if ( $chunk4 === "\001\000\000\000"
			&& substr( $chunk, 40, 4 ) === ' EMF' )
		{
			return 'image/x-emf';
		}
		if ( $chunk4 === "\xd7\xcd\xc6\x9a" ) {
			return 'image/x-wmf';
		}
		if ( $chunk4 === "\xca\xfe\xba\xbe" ) {
			return 'application/java';
		}
		if ( $chunk2 === 'PK' ) {
			return 'application/x-zip-compressed';
		}
		if ( $chunk2 === "\x1f\x9d" ) {
			return 'application/x-compressed';
		}
		if ( $chunk2 === "\x1f\x8b" ) {
			return 'application/x-gzip-compressed';
		}
		// Skip redundant check for ZIP
		if ( $chunk5 === "MThd\000" ) {
			return 'audio/mid';
		}
		if ( $chunk4 === '%PDF' ) {
			return 'application/pdf';
		}
		return false;
	}

	/**
	 * Scan the chunk byte by byte, counting character classes and looking
	 * for markers of text-based formats.
	 *
	 * Scanning stops early (so the counters stop increasing) once a
	 * scriptlet, RSS, Atom or XBM marker is recognised, or once the RDF tag
	 * and both RDF namespace URLs have been seen.
	 *
	 * @param $version String: the emulated version; selects CDF detection
	 *   (before 'ie07') or RSS/RDF/Atom detection (from 'ie07' on)
	 * @param $chunk String: the leading bytes of the file
	 * @return Array with two keys:
	 *   - 'found': map of marker name to true for every marker seen
	 *   - 'counters': counts for 'ctrl', 'high', 'low', 'lf', 'cr' and 'ff'
	 */
	protected function sampleData( $version, $chunk ) {
		$found = array();
		$counters = array(
			'ctrl' => 0,
			'high' => 0,
			'low' => 0,
			'lf' => 0,
			'cr' => 0,
			'ff' => 0
		);
		$htmlTags = array(
			'html',
			'head',
			'title',
			'body',
			'script',
			'a href',
			'pre',
			'img',
			'plaintext',
			'table'
		);
		$rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
		$rdfPurl = 'http://purl.org/rss/1.0/';
		$xbmMagic1 = '#define';
		$xbmMagic2 = '_width';
		$xbmMagic3 = '_bits';
		$binhexMagic = 'converted with BinHex';

		$length = strlen( $chunk );
		for ( $offset = 0; $offset < $length; $offset++ ) {
			$curChar = $chunk[$offset];

			// Classify the byte; everything except printable ASCII is only counted
			if ( $curChar === "\x0a" ) {
				$counters['lf']++;
				continue;
			} elseif ( $curChar === "\x0d" ) {
				$counters['cr']++;
				continue;
			} elseif ( $curChar === "\x0c" ) {
				$counters['ff']++;
				continue;
			} elseif ( $curChar === "\t" ) {
				$counters['low']++;
				continue;
			} elseif ( ord( $curChar ) < 32 ) {
				$counters['ctrl']++;
				continue;
			} elseif ( ord( $curChar ) >= 128 ) {
				$counters['high']++;
				continue;
			}

			$counters['low']++;
			if ( $curChar === '<' ) {
				// XML
				$remainder = substr( $chunk, $offset + 1 );
				if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
					// Character following "<?xml"
					$nextChar = substr( $chunk, $offset + 5, 1 );
					if ( $nextChar === ':' || $nextChar === ' ' || $nextChar === "\t" ) {
						$found['xml'] = true;
					}
				}
				// Scriptlet (JSP)
				if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
					$found['scriptlet'] = true;
					break;
				}
				// HTML
				foreach ( $htmlTags as $tag ) {
					if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
						$found['html'] = true;
					}
				}
				// Skip broken check for additional tags (HR etc.)

				// CHANNEL replaced by RSS, RDF and FEED in IE 7
				if ( $version < 'ie07' ) {
					if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
						$found['cdf'] = true;
					}
				} else {
					// RSS
					if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
						$found['rss'] = true;
						break; // return from SampleData
					}
					if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
						$found['rdf-tag'] = true;
						// no break
					}
					if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
						$found['atom'] = true;
						break;
					}
				}
				continue;
			}
			// Skip broken check for -->

			// RSS URL checks
			// For some reason both URLs must appear before it is recognised
			$remainder = substr( $chunk, $offset );
			if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
				$found['rdf-url'] = true;
				if ( isset( $found['rdf-tag'] )
					&& isset( $found['rdf-purl'] ) ) // [sic]
				{
					break;
				}
				continue;
			}

			if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
				// Record the match. Without this flag the three-way RDF test
				// in getMimeTypeForVersion() could never succeed.
				$found['rdf-purl'] = true;
				if ( isset( $found['rdf-tag'] )
					&& isset( $found['rdf-url'] ) ) // [sic]
				{
					break;
				}
				continue;
			}

			// XBM checks: "#define", then "_width", then "_bits", in that order
			if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
				$found['xbm1'] = true;
				continue;
			}
			if ( $curChar === '_' ) {
				if ( isset( $found['xbm2'] ) ) {
					if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
						$found['xbm'] = true;
						break;
					}
				} elseif ( isset( $found['xbm1'] ) ) {
					if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
						$found['xbm2'] = true;
					}
				}
			}

			// BinHex (case-sensitive, unlike the other markers)
			if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
				$found['binhex'] = true;
			}
		}
		return array( 'found' => $found, 'counters' => $counters );
	}

	/**
	 * Get the data format of a MIME type for a given version.
	 *
	 * @param $version String: one of the entries of $this->versions
	 * @param $type String: the MIME type to classify
	 * @return String: 'ambiguous' for an empty or '(null)' type, the name of
	 *   the format whose list contains the type, or 'unknown' if none does
	 */
	protected function getDataFormat( $version, $type ) {
		$types = $this->typeTable[$version];
		if ( $type == '(null)' || strval( $type ) === '' ) {
			return 'ambiguous';
		}
		foreach ( $types as $format => $list ) {
			if ( in_array( $type, $list, true ) ) {
				return $format;
			}
		}
		return 'unknown';
	}
}