MediaWiki  REL1_19
IEContentAnalyzer.php
Go to the documentation of this file.
00001 <?php
00002 
00021 class IEContentAnalyzer {
00025         protected $baseTypeTable = array(
00026                 'ambiguous' /*1*/ => array(
00027                         'text/plain',
00028                         'application/octet-stream',
00029                         'application/x-netcdf', // [sic]
00030                 ),
00031                 'text' /*3*/ => array(
00032                         'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
00033                         'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
00034                 ),
00035                 'binary' /*4*/ => array(
00036                         'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
00037                         'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
00038                         'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
00039                         'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
00040                         'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
00041                         'application/x-msdownload'
00042                 ),
00043                 'html' /*5*/ => array( 'text/html' ),
00044         );
00045 
00049         protected $addedTypes = array(
00050                 'ie07' => array(
00051                         'text' => array( 'text/xml', 'application/xml' )
00052                 ),
00053         );
00054 
00061         protected $registry = array(
00062                 '.323' => 'text/h323',
00063                 '.3g2' => 'video/3gpp2',
00064                 '.3gp' => 'video/3gpp',
00065                 '.3gp2' => 'video/3gpp2',
00066                 '.3gpp' => 'video/3gpp',
00067                 '.aac' => 'audio/aac',
00068                 '.ac3' => 'audio/ac3',
00069                 '.accda' => 'application/msaccess',
00070                 '.accdb' => 'application/msaccess',
00071                 '.accdc' => 'application/msaccess',
00072                 '.accde' => 'application/msaccess',
00073                 '.accdr' => 'application/msaccess',
00074                 '.accdt' => 'application/msaccess',
00075                 '.ade' => 'application/msaccess',
00076                 '.adp' => 'application/msaccess',
00077                 '.adts' => 'audio/aac',
00078                 '.ai' => 'application/postscript',
00079                 '.aif' => 'audio/aiff',
00080                 '.aifc' => 'audio/aiff',
00081                 '.aiff' => 'audio/aiff',
00082                 '.amc' => 'application/x-mpeg',
00083                 '.application' => 'application/x-ms-application',
00084                 '.asf' => 'video/x-ms-asf',
00085                 '.asx' => 'video/x-ms-asf',
00086                 '.au' => 'audio/basic',
00087                 '.avi' => 'video/avi',
00088                 '.bmp' => 'image/bmp',
00089                 '.caf' => 'audio/x-caf',
00090                 '.cat' => 'application/vnd.ms-pki.seccat',
00091                 '.cbo' => 'application/sha',
00092                 '.cdda' => 'audio/aiff',
00093                 '.cer' => 'application/x-x509-ca-cert',
00094                 '.conf' => 'text/plain',
00095                 '.crl' => 'application/pkix-crl',
00096                 '.crt' => 'application/x-x509-ca-cert',
00097                 '.css' => 'text/css',
00098                 '.csv' => 'application/vnd.ms-excel',
00099                 '.der' => 'application/x-x509-ca-cert',
00100                 '.dib' => 'image/bmp',
00101                 '.dif' => 'video/x-dv',
00102                 '.dll' => 'application/x-msdownload',
00103                 '.doc' => 'application/msword',
00104                 '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
00105                 '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
00106                 '.dot' => 'application/msword',
00107                 '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
00108                 '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
00109                 '.dv' => 'video/x-dv',
00110                 '.dwfx' => 'model/vnd.dwfx+xps',
00111                 '.edn' => 'application/vnd.adobe.edn',
00112                 '.eml' => 'message/rfc822',
00113                 '.eps' => 'application/postscript',
00114                 '.etd' => 'application/x-ebx',
00115                 '.exe' => 'application/x-msdownload',
00116                 '.fdf' => 'application/vnd.fdf',
00117                 '.fif' => 'application/fractals',
00118                 '.gif' => 'image/gif',
00119                 '.gsm' => 'audio/x-gsm',
00120                 '.hqx' => 'application/mac-binhex40',
00121                 '.hta' => 'application/hta',
00122                 '.htc' => 'text/x-component',
00123                 '.htm' => 'text/html',
00124                 '.html' => 'text/html',
00125                 '.htt' => 'text/webviewhtml',
00126                 '.hxa' => 'application/xml',
00127                 '.hxc' => 'application/xml',
00128                 '.hxd' => 'application/octet-stream',
00129                 '.hxe' => 'application/xml',
00130                 '.hxf' => 'application/xml',
00131                 '.hxh' => 'application/octet-stream',
00132                 '.hxi' => 'application/octet-stream',
00133                 '.hxk' => 'application/xml',
00134                 '.hxq' => 'application/octet-stream',
00135                 '.hxr' => 'application/octet-stream',
00136                 '.hxs' => 'application/octet-stream',
00137                 '.hxt' => 'application/xml',
00138                 '.hxv' => 'application/xml',
00139                 '.hxw' => 'application/octet-stream',
00140                 '.ico' => 'image/x-icon',
00141                 '.iii' => 'application/x-iphone',
00142                 '.ins' => 'application/x-internet-signup',
00143                 '.iqy' => 'text/x-ms-iqy',
00144                 '.isp' => 'application/x-internet-signup',
00145                 '.jfif' => 'image/jpeg',
00146                 '.jnlp' => 'application/x-java-jnlp-file',
00147                 '.jpe' => 'image/jpeg',
00148                 '.jpeg' => 'image/jpeg',
00149                 '.jpg' => 'image/jpeg',
00150                 '.jtx' => 'application/x-jtx+xps',
00151                 '.latex' => 'application/x-latex',
00152                 '.log' => 'text/plain',
00153                 '.m1v' => 'video/mpeg',
00154                 '.m2v' => 'video/mpeg',
00155                 '.m3u' => 'audio/x-mpegurl',
00156                 '.mac' => 'image/x-macpaint',
00157                 '.man' => 'application/x-troff-man',
00158                 '.mda' => 'application/msaccess',
00159                 '.mdb' => 'application/msaccess',
00160                 '.mde' => 'application/msaccess',
00161                 '.mfp' => 'application/x-shockwave-flash',
00162                 '.mht' => 'message/rfc822',
00163                 '.mhtml' => 'message/rfc822',
00164                 '.mid' => 'audio/mid',
00165                 '.midi' => 'audio/mid',
00166                 '.mod' => 'video/mpeg',
00167                 '.mov' => 'video/quicktime',
00168                 '.mp2' => 'video/mpeg',
00169                 '.mp2v' => 'video/mpeg',
00170                 '.mp3' => 'audio/mpeg',
00171                 '.mp4' => 'video/mp4',
00172                 '.mpa' => 'video/mpeg',
00173                 '.mpe' => 'video/mpeg',
00174                 '.mpeg' => 'video/mpeg',
00175                 '.mpf' => 'application/vnd.ms-mediapackage',
00176                 '.mpg' => 'video/mpeg',
00177                 '.mpv2' => 'video/mpeg',
00178                 '.mqv' => 'video/quicktime',
00179                 '.NMW' => 'application/nmwb',
00180                 '.nws' => 'message/rfc822',
00181                 '.odc' => 'text/x-ms-odc',
00182                 '.ols' => 'application/vnd.ms-publisher',
00183                 '.p10' => 'application/pkcs10',
00184                 '.p12' => 'application/x-pkcs12',
00185                 '.p7b' => 'application/x-pkcs7-certificates',
00186                 '.p7c' => 'application/pkcs7-mime',
00187                 '.p7m' => 'application/pkcs7-mime',
00188                 '.p7r' => 'application/x-pkcs7-certreqresp',
00189                 '.p7s' => 'application/pkcs7-signature',
00190                 '.pct' => 'image/pict',
00191                 '.pdf' => 'application/pdf',
00192                 '.pdx' => 'application/vnd.adobe.pdx',
00193                 '.pfx' => 'application/x-pkcs12',
00194                 '.pic' => 'image/pict',
00195                 '.pict' => 'image/pict',
00196                 '.pinstall' => 'application/x-picasa-detect',
00197                 '.pko' => 'application/vnd.ms-pki.pko',
00198                 '.png' => 'image/png',
00199                 '.pnt' => 'image/x-macpaint',
00200                 '.pntg' => 'image/x-macpaint',
00201                 '.pot' => 'application/vnd.ms-powerpoint',
00202                 '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
00203                 '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
00204                 '.ppa' => 'application/vnd.ms-powerpoint',
00205                 '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
00206                 '.pps' => 'application/vnd.ms-powerpoint',
00207                 '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
00208                 '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
00209                 '.ppt' => 'application/vnd.ms-powerpoint',
00210                 '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
00211                 '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
00212                 '.prf' => 'application/pics-rules',
00213                 '.ps' => 'application/postscript',
00214                 '.pub' => 'application/vnd.ms-publisher',
00215                 '.pwz' => 'application/vnd.ms-powerpoint',
00216                 '.py' => 'text/plain',
00217                 '.pyw' => 'text/plain',
00218                 '.qht' => 'text/x-html-insertion',
00219                 '.qhtm' => 'text/x-html-insertion',
00220                 '.qt' => 'video/quicktime',
00221                 '.qti' => 'image/x-quicktime',
00222                 '.qtif' => 'image/x-quicktime',
00223                 '.qtl' => 'application/x-quicktimeplayer',
00224                 '.rat' => 'application/rat-file',
00225                 '.rmf' => 'application/vnd.adobe.rmf',
00226                 '.rmi' => 'audio/mid',
00227                 '.rqy' => 'text/x-ms-rqy',
00228                 '.rtf' => 'application/msword',
00229                 '.sct' => 'text/scriptlet',
00230                 '.sd2' => 'audio/x-sd2',
00231                 '.sdp' => 'application/sdp',
00232                 '.shtml' => 'text/html',
00233                 '.sit' => 'application/x-stuffit',
00234                 '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
00235                 '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
00236                 '.slk' => 'application/vnd.ms-excel',
00237                 '.snd' => 'audio/basic',
00238                 '.so' => 'application/x-apachemodule',
00239                 '.sol' => 'text/plain',
00240                 '.sor' => 'text/plain',
00241                 '.spc' => 'application/x-pkcs7-certificates',
00242                 '.spl' => 'application/futuresplash',
00243                 '.sst' => 'application/vnd.ms-pki.certstore',
00244                 '.stl' => 'application/vnd.ms-pki.stl',
00245                 '.swf' => 'application/x-shockwave-flash',
00246                 '.thmx' => 'application/vnd.ms-officetheme',
00247                 '.tif' => 'image/tiff',
00248                 '.tiff' => 'image/tiff',
00249                 '.txt' => 'text/plain',
00250                 '.uls' => 'text/iuls',
00251                 '.vcf' => 'text/x-vcard',
00252                 '.vdx' => 'application/vnd.ms-visio.viewer',
00253                 '.vsd' => 'application/vnd.ms-visio.viewer',
00254                 '.vss' => 'application/vnd.ms-visio.viewer',
00255                 '.vst' => 'application/vnd.ms-visio.viewer',
00256                 '.vsx' => 'application/vnd.ms-visio.viewer',
00257                 '.vtx' => 'application/vnd.ms-visio.viewer',
00258                 '.wav' => 'audio/wav',
00259                 '.wax' => 'audio/x-ms-wax',
00260                 '.wbk' => 'application/msword',
00261                 '.wdp' => 'image/vnd.ms-photo',
00262                 '.wiz' => 'application/msword',
00263                 '.wm' => 'video/x-ms-wm',
00264                 '.wma' => 'audio/x-ms-wma',
00265                 '.wmd' => 'application/x-ms-wmd',
00266                 '.wmv' => 'video/x-ms-wmv',
00267                 '.wmx' => 'video/x-ms-wmx',
00268                 '.wmz' => 'application/x-ms-wmz',
00269                 '.wpl' => 'application/vnd.ms-wpl',
00270                 '.wsc' => 'text/scriptlet',
00271                 '.wvx' => 'video/x-ms-wvx',
00272                 '.xaml' => 'application/xaml+xml',
00273                 '.xbap' => 'application/x-ms-xbap',
00274                 '.xdp' => 'application/vnd.adobe.xdp+xml',
00275                 '.xfdf' => 'application/vnd.adobe.xfdf',
00276                 '.xht' => 'application/xhtml+xml',
00277                 '.xhtml' => 'application/xhtml+xml',
00278                 '.xla' => 'application/vnd.ms-excel',
00279                 '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
00280                 '.xlk' => 'application/vnd.ms-excel',
00281                 '.xll' => 'application/vnd.ms-excel',
00282                 '.xlm' => 'application/vnd.ms-excel',
00283                 '.xls' => 'application/vnd.ms-excel',
00284                 '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
00285                 '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
00286                 '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
00287                 '.xlt' => 'application/vnd.ms-excel',
00288                 '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
00289                 '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
00290                 '.xlw' => 'application/vnd.ms-excel',
00291                 '.xml' => 'text/xml',
00292                 '.xps' => 'application/vnd.ms-xpsdocument',
00293                 '.xsl' => 'text/xml',
00294         );
00295 
00301         protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );
00302 
00306         protected $typeTable = array();
00307 
00309         function __construct() {
00310                 // Construct versioned type arrays from the base type array plus additions
00311                 $types = $this->baseTypeTable;
00312                 foreach ( $this->versions as $version ) {
00313                         if ( isset( $this->addedTypes[$version] ) ) {
00314                                 foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
00315                                         $types[$format] = array_merge( $types[$format], $addedTypes );
00316                                 }
00317                         }
00318                         $this->typeTable[$version] = $types;
00319                 }
00320         }
00321 
00332         public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
00333                 $types = $this->getMimesFromData( $fileName, $chunk, $proposed );
00334                 $types = array_map( array( $this, 'translateMimeType' ), $types );
00335                 return $types;
00336         }
00337 
00344         public function translateMimeType( $type ) {
00345                 static $table = array(
00346                         'image/pjpeg' => 'image/jpeg',
00347                         'image/x-png' => 'image/png',
00348                         'image/x-wmf' => 'application/x-msmetafile',
00349                         'image/bmp' => 'image/x-bmp',
00350                         'application/x-zip-compressed' => 'application/zip',
00351                         'application/x-compressed' => 'application/x-compress',
00352                         'application/x-gzip-compressed' => 'application/x-gzip',
00353                         'audio/mid' => 'audio/midi',
00354                 );
00355                 if ( isset( $table[$type] ) ) {
00356                         $type = $table[$type];
00357                 }
00358                 return $type;
00359         }
00360 
00370         public function getMimesFromData( $fileName, $chunk, $proposed ) {
00371                 $types = array();
00372                 foreach ( $this->versions as $version ) {
00373                         $types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
00374                 }
00375                 return $types;
00376         }
00377 
00386         protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
00387                 // Strip text after a semicolon
00388                 $semiPos = strpos( $proposed, ';' );
00389                 if ( $semiPos !== false ) {
00390                         $proposed = substr( $proposed, 0, $semiPos );
00391                 }
00392 
00393                 $proposedFormat = $this->getDataFormat( $version, $proposed );
00394                 if ( $proposedFormat == 'unknown'
00395                         && $proposed != 'multipart/mixed'
00396                         && $proposed != 'multipart/x-mixed-replace' )
00397                 {
00398                         return $proposed;
00399                 }
00400                 if ( strval( $chunk ) === '' ) {
00401                         return $proposed;
00402                 }
00403 
00404                 // Truncate chunk at 255 bytes
00405                 $chunk = substr( $chunk, 0, 255 );
00406 
00407                 // IE does the Check*Headers() calls last, and instead does the following image
00408                 // type checks by directly looking for the magic numbers. What I do here should
00409                 // have the same effect since the magic number checks are identical in both cases.
00410                 $result = $this->sampleData( $version, $chunk );
00411                 $sampleFound = $result['found'];
00412                 $counters = $result['counters'];
00413                 $binaryType = $this->checkBinaryHeaders( $version, $chunk );
00414                 $textType = $this->checkTextHeaders( $version, $chunk );
00415 
00416                 if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
00417                         return 'text/html';
00418                 }
00419                 if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
00420                         return 'image/gif';
00421                 }
00422                 if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
00423                         && $binaryType == 'image/pjpeg' )
00424                 {
00425                         return $proposed;
00426                 }
00427                 // PNG check added in IE 7
00428                 if ( $version >= 'ie07'
00429                         && ( $proposed == 'image/x-png' || $proposed == 'image/png' )
00430                         && $binaryType == 'image/x-png' )
00431                 {
00432                         return $proposed;
00433                 }
00434 
00435                 // CDF was removed in IE 7 so it won't be in $sampleFound for later versions
00436                 if ( isset( $sampleFound['cdf'] ) ) {
00437                         return 'application/x-cdf';
00438                 }
00439 
00440                 // RSS and Atom were added in IE 7 so they won't be in $sampleFound for
00441                 // previous versions
00442                 if ( isset( $sampleFound['rss'] ) ) {
00443                         return 'application/rss+xml';
00444                 }
00445                 if ( isset( $sampleFound['rdf-tag'] )
00446                         && isset( $sampleFound['rdf-url'] )
00447                         && isset( $sampleFound['rdf-purl'] ) )
00448                 {
00449                         return 'application/rss+xml';
00450                 }
00451                 if ( isset( $sampleFound['atom'] ) ) {
00452                         return 'application/atom+xml';
00453                 }
00454 
00455                 if ( isset( $sampleFound['xml'] ) ) {
00456                         // TODO: I'm not sure under what circumstances this flag is enabled
00457                         if ( strpos( $version, 'strict' ) !== false ) {
00458                                 if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
00459                                         return 'text/xml';
00460                                 }
00461                         } else {
00462                                 return 'text/xml';
00463                         }
00464                 }
00465                 if ( isset( $sampleFound['html'] ) ) {
00466                         // TODO: I'm not sure under what circumstances this flag is enabled
00467                         if ( strpos( $version, 'nohtml' ) !== false ) {
00468                                 if ( $proposed == 'text/plain' ) {
00469                                         return 'text/html';
00470                                 }
00471                         } else {
00472                                 return 'text/html';
00473                         }
00474                 }
00475                 if ( isset( $sampleFound['xbm'] ) ) {
00476                         return 'image/x-bitmap';
00477                 }
00478                 if ( isset( $sampleFound['binhex'] ) ) {
00479                         return 'application/macbinhex40';
00480                 }
00481                 if ( isset( $sampleFound['scriptlet'] ) ) {
00482                         if ( strpos( $version, 'strict' ) !== false ) {
00483                                 if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
00484                                         return 'text/scriptlet';
00485                                 }
00486                         } else {
00487                                 return 'text/scriptlet';
00488                         }
00489                 }
00490 
00491                 // Freaky heuristics to determine if the data is text or binary
00492                 // The heuristic is of course broken for non-ASCII text
00493                 if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
00494                         < ( $counters['ctrl'] + $counters['high'] ) * 16 )
00495                 {
00496                         $kindOfBinary = true;
00497                         $type = $binaryType ? $binaryType : $textType;
00498                         if ( $type === false ) {
00499                                 $type = 'application/octet-stream';
00500                         }
00501                 } else {
00502                         $kindOfBinary = false;
00503                         $type = $textType ? $textType : $binaryType;
00504                         if ( $type === false ) {
00505                                 $type = 'text/plain';
00506                         }
00507                 }
00508 
00509                 // Check if the output format is ambiguous
00510                 // This generally means that detection failed, real types aren't ambiguous
00511                 $detectedFormat = $this->getDataFormat( $version, $type );
00512                 if ( $detectedFormat != 'ambiguous' ) {
00513                         return $type;
00514                 }
00515 
00516                 if ( $proposedFormat != 'ambiguous' ) {
00517                         // FormatAgreesWithData()
00518                         if ( $proposedFormat == 'text' && !$kindOfBinary ) {
00519                                 return $proposed;
00520                         }
00521                         if ( $proposedFormat == 'binary' && $kindOfBinary ) {
00522                                 return $proposed;
00523                         }
00524                         if ( $proposedFormat == 'html' ) {
00525                                 return $proposed;
00526                         }
00527                 }
00528 
00529                 // Find a MIME type by searching the registry for the file extension.
00530                 $dotPos = strrpos( $fileName, '.' );
00531                 if ( $dotPos === false ) {
00532                         return $type;
00533                 }
00534                 $ext = substr( $fileName, $dotPos );
00535                 if ( isset( $this->registry[$ext] ) ) {
00536                         return $this->registry[$ext];
00537                 }
00538 
00539                 // TODO: If the extension has an application registered to it, IE will return
00540                 // application/octet-stream. We'll skip that, so we could erroneously
00541                 // return text/plain or application/x-netcdf where application/octet-stream
00542                 // would be correct.
00543 
00544                 return $type;
00545         }
00546 
00554         private function checkTextHeaders( $version, $chunk ) {
00555                 $chunk2 = substr( $chunk, 0, 2 );
00556                 $chunk4 = substr( $chunk, 0, 4 );
00557                 $chunk5 = substr( $chunk, 0, 5 );
00558                 if ( $chunk4 == '%PDF' ) {
00559                         return 'application/pdf';
00560                 }
00561                 if ( $chunk2 == '%!' ) {
00562                         return 'application/postscript';
00563                 }
00564                 if ( $chunk5 == '{\\rtf' ) {
00565                         return 'text/richtext';
00566                 }
00567                 if ( $chunk5 == 'begin' ) {
00568                         return 'application/base64';
00569                 }
00570                 return false;
00571         }
00572 
00580         private function checkBinaryHeaders( $version, $chunk ) {
00581                 $chunk2 = substr( $chunk, 0, 2 );
00582                 $chunk3 = substr( $chunk, 0, 3 );
00583                 $chunk4 = substr( $chunk, 0, 4 );
00584                 $chunk5 = substr( $chunk, 0, 5 );
00585                 $chunk5uc = strtoupper( $chunk5 );
00586                 $chunk8 = substr( $chunk, 0, 8 );
00587                 if ( $chunk5uc == 'GIF87' || $chunk5uc == 'GIF89' ) {
00588                         return 'image/gif';
00589                 }
00590                 if ( $chunk2 == "\xff\xd8" ) {
00591                         return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
00592                 }
00593 
00594                 if ( $chunk2 == 'BM'
00595                         && substr( $chunk, 6, 2 ) == "\000\000"
00596                         && substr( $chunk, 8, 2 ) == "\000\000" )
00597                 {
00598                         return 'image/bmp'; // another non-standard MIME
00599                 }
00600                 if ( $chunk4 == 'RIFF'
00601                         && substr( $chunk, 8, 4 ) == 'WAVE' )
00602                 {
00603                         return 'audio/wav';
00604                 }
00605                 // These were integer literals in IE
00606                 // Perhaps the author was not sure what the target endianness was
00607                 if ( $chunk4 == ".sd\000"
00608                         || $chunk4 == ".snd"
00609                         || $chunk4 == "\000ds."
00610                         || $chunk4 == "dns." )
00611                 {
00612                         return 'audio/basic';
00613                 }
00614                 if ( $chunk3 == "MM\000" ) {
00615                         return 'image/tiff';
00616                 }
00617                 if ( $chunk2 == 'MZ' ) {
00618                         return 'application/x-msdownload';
00619                 }
00620                 if ( $chunk8 == "\x89PNG\x0d\x0a\x1a\x0a" ) {
00621                         return 'image/x-png'; // [sic]
00622                 }
00623                 if ( strlen( $chunk ) >= 5 ) {
00624                         $byte2 = ord( $chunk[2] );
00625                         $byte4 = ord( $chunk[4] );
00626                         if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 == 0 && $chunk2 == 'JG' ) {
00627                                 return 'image/x-jg';
00628                         }
00629                 }
00630                 // More endian confusion?
00631                 if ( $chunk4 == 'MROF' ) {
00632                         return 'audio/x-aiff';
00633                 }
00634                 $chunk4_8 = substr( $chunk, 8, 4 );
00635                 if ( $chunk4 == 'FORM' && ( $chunk4_8 == 'AIFF' || $chunk4_8 == 'AIFC' ) ) {
00636                         return 'audio/x-aiff';
00637                 }
00638                 if ( $chunk4 == 'RIFF' && $chunk4_8 == 'AVI ' ) {
00639                         return 'video/avi';
00640                 }
00641                 if ( $chunk4 == "\x00\x00\x01\xb3" || $chunk4 == "\x00\x00\x01\xba" ) {
00642                         return 'video/mpeg';
00643                 }
00644                 if ( $chunk4 == "\001\000\000\000"
00645                         && substr( $chunk, 40, 4 ) == ' EMF' )
00646                 {
00647                         return 'image/x-emf';
00648                 }
00649                 if ( $chunk4 == "\xd7\xcd\xc6\x9a" ) {
00650                         return 'image/x-wmf';
00651                 }
00652                 if ( $chunk4 == "\xca\xfe\xba\xbe" ) {
00653                         return 'application/java';
00654                 }
00655                 if ( $chunk2 == 'PK' ) {
00656                         return 'application/x-zip-compressed';
00657                 }
00658                 if ( $chunk2 == "\x1f\x9d" ) {
00659                         return 'application/x-compressed';
00660                 }
00661                 if ( $chunk2 == "\x1f\x8b" ) {
00662                         return 'application/x-gzip-compressed';
00663                 }
00664                 // Skip redundant check for ZIP
00665                 if ( $chunk5 == "MThd\000" ) {
00666                         return 'audio/mid';
00667                 }
00668                 if ( $chunk4 == '%PDF' ) {
00669                         return 'application/pdf';
00670                 }
00671                 return false;
00672         }
00673 
00681         protected function sampleData( $version, $chunk ) {
00682                 $found = array();
00683                 $counters = array(
00684                         'ctrl' => 0,
00685                         'high' => 0,
00686                         'low' => 0,
00687                         'lf' => 0,
00688                         'cr' => 0,
00689                         'ff' => 0
00690                 );
00691                 $htmlTags = array(
00692                         'html',
00693                         'head',
00694                         'title',
00695                         'body',
00696                         'script',
00697                         'a href',
00698                         'pre',
00699                         'img',
00700                         'plaintext',
00701                         'table'
00702                 );
00703                 $rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
00704                 $rdfPurl = 'http://purl.org/rss/1.0/';
00705                 $xbmMagic1 = '#define';
00706                 $xbmMagic2 = '_width';
00707                 $xbmMagic3 = '_bits';
00708                 $binhexMagic = 'converted with BinHex';
00709 
00710                 for ( $offset = 0; $offset < strlen( $chunk ); $offset++ ) {
00711                         $curChar = $chunk[$offset];
00712                         if ( $curChar == "\x0a" ) {
00713                                 $counters['lf']++;
00714                                 continue;
00715                         } elseif ( $curChar == "\x0d" ) {
00716                                 $counters['cr']++;
00717                                 continue;
00718                         } elseif ( $curChar == "\x0c" ) {
00719                                 $counters['ff']++;
00720                                 continue;
00721                         } elseif ( $curChar == "\t" ) {
00722                                 $counters['low']++;
00723                                 continue;
00724                         } elseif ( ord( $curChar ) < 32 ) {
00725                                 $counters['ctrl']++;
00726                                 continue;
00727                         } elseif ( ord( $curChar ) >= 128 ) {
00728                                 $counters['high']++;
00729                                 continue;
00730                         }
00731 
00732                         $counters['low']++;
00733                         if ( $curChar == '<' ) {
00734                                 // XML
00735                                 $remainder = substr( $chunk, $offset + 1 );
00736                                 if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
00737                                         $nextChar = substr( $chunk, $offset + 5, 1 );
00738                                         if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
00739                                                 $found['xml'] = true;
00740                                         }
00741                                 }
00742                                 // Scriptlet (JSP)
00743                                 if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
00744                                         $found['scriptlet'] = true;
00745                                         break;
00746                                 }
00747                                 // HTML
00748                                 foreach ( $htmlTags as $tag ) {
00749                                         if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
00750                                                 $found['html'] = true;
00751                                         }
00752                                 }
00753                                 // Skip broken check for additional tags (HR etc.)
00754 
00755                                 // CHANNEL replaced by RSS, RDF and FEED in IE 7
00756                                 if ( $version < 'ie07' ) {
00757                                         if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
00758                                                 $found['cdf'] = true;
00759                                         }
00760                                 } else {
00761                                         // RSS
00762                                         if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
00763                                                 $found['rss'] = true;
00764                                                 break; // return from SampleData
00765                                         }
00766                                         if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
00767                                                 $found['rdf-tag'] = true;
00768                                                 // no break
00769                                         }
00770                                         if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
00771                                                 $found['atom'] = true;
00772                                                 break;
00773                                         }
00774                                 }
00775                                 continue;
00776                         }
00777                         // Skip broken check for -->
00778 
00779                         // RSS URL checks
00780                         // For some reason both URLs must appear before it is recognised
00781                         $remainder = substr( $chunk, $offset );
00782                         if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
00783                                 $found['rdf-url'] = true;
00784                                 if ( isset( $found['rdf-tag'] )
00785                                         && isset( $found['rdf-purl'] ) ) // [sic]
00786                                 {
00787                                         break;
00788                                 }
00789                                 continue;
00790                         }
00791 
00792                         if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
00793                                 if ( isset( $found['rdf-tag'] )
00794                                         && isset( $found['rdf-url'] ) ) // [sic]
00795                                 {
00796                                         break;
00797                                 }
00798                                 continue;
00799                         }
00800 
00801                         // XBM checks
00802                         if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
00803                                 $found['xbm1'] = true;
00804                                 continue;
00805                         }
00806                         if ( $curChar == '_' ) {
00807                                 if ( isset( $found['xbm2'] ) ) {
00808                                         if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
00809                                                 $found['xbm'] = true;
00810                                                 break;
00811                                         }
00812                                 } elseif ( isset( $found['xbm1'] ) ) {
00813                                         if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
00814                                                 $found['xbm2'] = true;
00815                                         }
00816                                 }
00817                         }
00818 
00819                         // BinHex
00820                         if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
00821                                 $found['binhex'] = true;
00822                         }
00823                 }
00824                 return array( 'found' => $found, 'counters' => $counters );
00825         }
00826 
00832         protected function getDataFormat( $version, $type ) {
00833                 $types = $this->typeTable[$version];
00834                 if ( $type == '(null)' || strval( $type ) === '' ) {
00835                         return 'ambiguous';
00836                 }
00837                 foreach ( $types as $format => $list ) {
00838                         if ( in_array( $type, $list ) ) {
00839                                 return $format;
00840                         }
00841                 }
00842                 return 'unknown';
00843         }
00844 }
00845