MediaWiki  REL1_22
IEContentAnalyzer.php
Go to the documentation of this file.
00001 <?php
00027 class IEContentAnalyzer {
00031     protected $baseTypeTable = array(
00032         'ambiguous' /*1*/ => array(
00033             'text/plain',
00034             'application/octet-stream',
00035             'application/x-netcdf', // [sic]
00036         ),
00037         'text' /*3*/ => array(
00038             'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
00039             'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
00040         ),
00041         'binary' /*4*/ => array(
00042             'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
00043             'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
00044             'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
00045             'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
00046             'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
00047             'application/x-msdownload'
00048         ),
00049         'html' /*5*/ => array( 'text/html' ),
00050     );
00051 
00055     protected $addedTypes = array(
00056         'ie07' => array(
00057             'text' => array( 'text/xml', 'application/xml' )
00058         ),
00059     );
00060 
00067     protected $registry = array(
00068         '.323' => 'text/h323',
00069         '.3g2' => 'video/3gpp2',
00070         '.3gp' => 'video/3gpp',
00071         '.3gp2' => 'video/3gpp2',
00072         '.3gpp' => 'video/3gpp',
00073         '.aac' => 'audio/aac',
00074         '.ac3' => 'audio/ac3',
00075         '.accda' => 'application/msaccess',
00076         '.accdb' => 'application/msaccess',
00077         '.accdc' => 'application/msaccess',
00078         '.accde' => 'application/msaccess',
00079         '.accdr' => 'application/msaccess',
00080         '.accdt' => 'application/msaccess',
00081         '.ade' => 'application/msaccess',
00082         '.adp' => 'application/msaccess',
00083         '.adts' => 'audio/aac',
00084         '.ai' => 'application/postscript',
00085         '.aif' => 'audio/aiff',
00086         '.aifc' => 'audio/aiff',
00087         '.aiff' => 'audio/aiff',
00088         '.amc' => 'application/x-mpeg',
00089         '.application' => 'application/x-ms-application',
00090         '.asf' => 'video/x-ms-asf',
00091         '.asx' => 'video/x-ms-asf',
00092         '.au' => 'audio/basic',
00093         '.avi' => 'video/avi',
00094         '.bmp' => 'image/bmp',
00095         '.caf' => 'audio/x-caf',
00096         '.cat' => 'application/vnd.ms-pki.seccat',
00097         '.cbo' => 'application/sha',
00098         '.cdda' => 'audio/aiff',
00099         '.cer' => 'application/x-x509-ca-cert',
00100         '.conf' => 'text/plain',
00101         '.crl' => 'application/pkix-crl',
00102         '.crt' => 'application/x-x509-ca-cert',
00103         '.css' => 'text/css',
00104         '.csv' => 'application/vnd.ms-excel',
00105         '.der' => 'application/x-x509-ca-cert',
00106         '.dib' => 'image/bmp',
00107         '.dif' => 'video/x-dv',
00108         '.dll' => 'application/x-msdownload',
00109         '.doc' => 'application/msword',
00110         '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
00111         '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
00112         '.dot' => 'application/msword',
00113         '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
00114         '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
00115         '.dv' => 'video/x-dv',
00116         '.dwfx' => 'model/vnd.dwfx+xps',
00117         '.edn' => 'application/vnd.adobe.edn',
00118         '.eml' => 'message/rfc822',
00119         '.eps' => 'application/postscript',
00120         '.etd' => 'application/x-ebx',
00121         '.exe' => 'application/x-msdownload',
00122         '.fdf' => 'application/vnd.fdf',
00123         '.fif' => 'application/fractals',
00124         '.gif' => 'image/gif',
00125         '.gsm' => 'audio/x-gsm',
00126         '.hqx' => 'application/mac-binhex40',
00127         '.hta' => 'application/hta',
00128         '.htc' => 'text/x-component',
00129         '.htm' => 'text/html',
00130         '.html' => 'text/html',
00131         '.htt' => 'text/webviewhtml',
00132         '.hxa' => 'application/xml',
00133         '.hxc' => 'application/xml',
00134         '.hxd' => 'application/octet-stream',
00135         '.hxe' => 'application/xml',
00136         '.hxf' => 'application/xml',
00137         '.hxh' => 'application/octet-stream',
00138         '.hxi' => 'application/octet-stream',
00139         '.hxk' => 'application/xml',
00140         '.hxq' => 'application/octet-stream',
00141         '.hxr' => 'application/octet-stream',
00142         '.hxs' => 'application/octet-stream',
00143         '.hxt' => 'application/xml',
00144         '.hxv' => 'application/xml',
00145         '.hxw' => 'application/octet-stream',
00146         '.ico' => 'image/x-icon',
00147         '.iii' => 'application/x-iphone',
00148         '.ins' => 'application/x-internet-signup',
00149         '.iqy' => 'text/x-ms-iqy',
00150         '.isp' => 'application/x-internet-signup',
00151         '.jfif' => 'image/jpeg',
00152         '.jnlp' => 'application/x-java-jnlp-file',
00153         '.jpe' => 'image/jpeg',
00154         '.jpeg' => 'image/jpeg',
00155         '.jpg' => 'image/jpeg',
00156         '.jtx' => 'application/x-jtx+xps',
00157         '.latex' => 'application/x-latex',
00158         '.log' => 'text/plain',
00159         '.m1v' => 'video/mpeg',
00160         '.m2v' => 'video/mpeg',
00161         '.m3u' => 'audio/x-mpegurl',
00162         '.mac' => 'image/x-macpaint',
00163         '.man' => 'application/x-troff-man',
00164         '.mda' => 'application/msaccess',
00165         '.mdb' => 'application/msaccess',
00166         '.mde' => 'application/msaccess',
00167         '.mfp' => 'application/x-shockwave-flash',
00168         '.mht' => 'message/rfc822',
00169         '.mhtml' => 'message/rfc822',
00170         '.mid' => 'audio/mid',
00171         '.midi' => 'audio/mid',
00172         '.mod' => 'video/mpeg',
00173         '.mov' => 'video/quicktime',
00174         '.mp2' => 'video/mpeg',
00175         '.mp2v' => 'video/mpeg',
00176         '.mp3' => 'audio/mpeg',
00177         '.mp4' => 'video/mp4',
00178         '.mpa' => 'video/mpeg',
00179         '.mpe' => 'video/mpeg',
00180         '.mpeg' => 'video/mpeg',
00181         '.mpf' => 'application/vnd.ms-mediapackage',
00182         '.mpg' => 'video/mpeg',
00183         '.mpv2' => 'video/mpeg',
00184         '.mqv' => 'video/quicktime',
00185         '.NMW' => 'application/nmwb',
00186         '.nws' => 'message/rfc822',
00187         '.odc' => 'text/x-ms-odc',
00188         '.ols' => 'application/vnd.ms-publisher',
00189         '.p10' => 'application/pkcs10',
00190         '.p12' => 'application/x-pkcs12',
00191         '.p7b' => 'application/x-pkcs7-certificates',
00192         '.p7c' => 'application/pkcs7-mime',
00193         '.p7m' => 'application/pkcs7-mime',
00194         '.p7r' => 'application/x-pkcs7-certreqresp',
00195         '.p7s' => 'application/pkcs7-signature',
00196         '.pct' => 'image/pict',
00197         '.pdf' => 'application/pdf',
00198         '.pdx' => 'application/vnd.adobe.pdx',
00199         '.pfx' => 'application/x-pkcs12',
00200         '.pic' => 'image/pict',
00201         '.pict' => 'image/pict',
00202         '.pinstall' => 'application/x-picasa-detect',
00203         '.pko' => 'application/vnd.ms-pki.pko',
00204         '.png' => 'image/png',
00205         '.pnt' => 'image/x-macpaint',
00206         '.pntg' => 'image/x-macpaint',
00207         '.pot' => 'application/vnd.ms-powerpoint',
00208         '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
00209         '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
00210         '.ppa' => 'application/vnd.ms-powerpoint',
00211         '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
00212         '.pps' => 'application/vnd.ms-powerpoint',
00213         '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
00214         '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
00215         '.ppt' => 'application/vnd.ms-powerpoint',
00216         '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
00217         '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
00218         '.prf' => 'application/pics-rules',
00219         '.ps' => 'application/postscript',
00220         '.pub' => 'application/vnd.ms-publisher',
00221         '.pwz' => 'application/vnd.ms-powerpoint',
00222         '.py' => 'text/plain',
00223         '.pyw' => 'text/plain',
00224         '.qht' => 'text/x-html-insertion',
00225         '.qhtm' => 'text/x-html-insertion',
00226         '.qt' => 'video/quicktime',
00227         '.qti' => 'image/x-quicktime',
00228         '.qtif' => 'image/x-quicktime',
00229         '.qtl' => 'application/x-quicktimeplayer',
00230         '.rat' => 'application/rat-file',
00231         '.rmf' => 'application/vnd.adobe.rmf',
00232         '.rmi' => 'audio/mid',
00233         '.rqy' => 'text/x-ms-rqy',
00234         '.rtf' => 'application/msword',
00235         '.sct' => 'text/scriptlet',
00236         '.sd2' => 'audio/x-sd2',
00237         '.sdp' => 'application/sdp',
00238         '.shtml' => 'text/html',
00239         '.sit' => 'application/x-stuffit',
00240         '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
00241         '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
00242         '.slk' => 'application/vnd.ms-excel',
00243         '.snd' => 'audio/basic',
00244         '.so' => 'application/x-apachemodule',
00245         '.sol' => 'text/plain',
00246         '.sor' => 'text/plain',
00247         '.spc' => 'application/x-pkcs7-certificates',
00248         '.spl' => 'application/futuresplash',
00249         '.sst' => 'application/vnd.ms-pki.certstore',
00250         '.stl' => 'application/vnd.ms-pki.stl',
00251         '.swf' => 'application/x-shockwave-flash',
00252         '.thmx' => 'application/vnd.ms-officetheme',
00253         '.tif' => 'image/tiff',
00254         '.tiff' => 'image/tiff',
00255         '.txt' => 'text/plain',
00256         '.uls' => 'text/iuls',
00257         '.vcf' => 'text/x-vcard',
00258         '.vdx' => 'application/vnd.ms-visio.viewer',
00259         '.vsd' => 'application/vnd.ms-visio.viewer',
00260         '.vss' => 'application/vnd.ms-visio.viewer',
00261         '.vst' => 'application/vnd.ms-visio.viewer',
00262         '.vsx' => 'application/vnd.ms-visio.viewer',
00263         '.vtx' => 'application/vnd.ms-visio.viewer',
00264         '.wav' => 'audio/wav',
00265         '.wax' => 'audio/x-ms-wax',
00266         '.wbk' => 'application/msword',
00267         '.wdp' => 'image/vnd.ms-photo',
00268         '.wiz' => 'application/msword',
00269         '.wm' => 'video/x-ms-wm',
00270         '.wma' => 'audio/x-ms-wma',
00271         '.wmd' => 'application/x-ms-wmd',
00272         '.wmv' => 'video/x-ms-wmv',
00273         '.wmx' => 'video/x-ms-wmx',
00274         '.wmz' => 'application/x-ms-wmz',
00275         '.wpl' => 'application/vnd.ms-wpl',
00276         '.wsc' => 'text/scriptlet',
00277         '.wvx' => 'video/x-ms-wvx',
00278         '.xaml' => 'application/xaml+xml',
00279         '.xbap' => 'application/x-ms-xbap',
00280         '.xdp' => 'application/vnd.adobe.xdp+xml',
00281         '.xfdf' => 'application/vnd.adobe.xfdf',
00282         '.xht' => 'application/xhtml+xml',
00283         '.xhtml' => 'application/xhtml+xml',
00284         '.xla' => 'application/vnd.ms-excel',
00285         '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
00286         '.xlk' => 'application/vnd.ms-excel',
00287         '.xll' => 'application/vnd.ms-excel',
00288         '.xlm' => 'application/vnd.ms-excel',
00289         '.xls' => 'application/vnd.ms-excel',
00290         '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
00291         '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
00292         '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
00293         '.xlt' => 'application/vnd.ms-excel',
00294         '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
00295         '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
00296         '.xlw' => 'application/vnd.ms-excel',
00297         '.xml' => 'text/xml',
00298         '.xps' => 'application/vnd.ms-xpsdocument',
00299         '.xsl' => 'text/xml',
00300     );
00301 
00307     protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );
00308 
00312     protected $typeTable = array();
00313 
00315     function __construct() {
00316         // Construct versioned type arrays from the base type array plus additions
00317         $types = $this->baseTypeTable;
00318         foreach ( $this->versions as $version ) {
00319             if ( isset( $this->addedTypes[$version] ) ) {
00320                 foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
00321                     $types[$format] = array_merge( $types[$format], $addedTypes );
00322                 }
00323             }
00324             $this->typeTable[$version] = $types;
00325         }
00326     }
00327 
00338     public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
00339         $types = $this->getMimesFromData( $fileName, $chunk, $proposed );
00340         $types = array_map( array( $this, 'translateMimeType' ), $types );
00341         return $types;
00342     }
00343 
00350     public function translateMimeType( $type ) {
00351         static $table = array(
00352             'image/pjpeg' => 'image/jpeg',
00353             'image/x-png' => 'image/png',
00354             'image/x-wmf' => 'application/x-msmetafile',
00355             'image/bmp' => 'image/x-bmp',
00356             'application/x-zip-compressed' => 'application/zip',
00357             'application/x-compressed' => 'application/x-compress',
00358             'application/x-gzip-compressed' => 'application/x-gzip',
00359             'audio/mid' => 'audio/midi',
00360         );
00361         if ( isset( $table[$type] ) ) {
00362             $type = $table[$type];
00363         }
00364         return $type;
00365     }
00366 
00376     public function getMimesFromData( $fileName, $chunk, $proposed ) {
00377         $types = array();
00378         foreach ( $this->versions as $version ) {
00379             $types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
00380         }
00381         return $types;
00382     }
00383 
00392     protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
00393         // Strip text after a semicolon
00394         $semiPos = strpos( $proposed, ';' );
00395         if ( $semiPos !== false ) {
00396             $proposed = substr( $proposed, 0, $semiPos );
00397         }
00398 
00399         $proposedFormat = $this->getDataFormat( $version, $proposed );
00400         if ( $proposedFormat == 'unknown'
00401             && $proposed != 'multipart/mixed'
00402             && $proposed != 'multipart/x-mixed-replace' )
00403         {
00404             return $proposed;
00405         }
00406         if ( strval( $chunk ) === '' ) {
00407             return $proposed;
00408         }
00409 
00410         // Truncate chunk at 255 bytes
00411         $chunk = substr( $chunk, 0, 255 );
00412 
00413         // IE does the Check*Headers() calls last, and instead does the following image
00414         // type checks by directly looking for the magic numbers. What I do here should
00415         // have the same effect since the magic number checks are identical in both cases.
00416         $result = $this->sampleData( $version, $chunk );
00417         $sampleFound = $result['found'];
00418         $counters = $result['counters'];
00419         $binaryType = $this->checkBinaryHeaders( $version, $chunk );
00420         $textType = $this->checkTextHeaders( $version, $chunk );
00421 
00422         if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
00423             return 'text/html';
00424         }
00425         if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
00426             return 'image/gif';
00427         }
00428         if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
00429             && $binaryType == 'image/pjpeg' )
00430         {
00431             return $proposed;
00432         }
00433         // PNG check added in IE 7
00434         if ( $version >= 'ie07'
00435             && ( $proposed == 'image/x-png' || $proposed == 'image/png' )
00436             && $binaryType == 'image/x-png' )
00437         {
00438             return $proposed;
00439         }
00440 
00441         // CDF was removed in IE 7 so it won't be in $sampleFound for later versions
00442         if ( isset( $sampleFound['cdf'] ) ) {
00443             return 'application/x-cdf';
00444         }
00445 
00446         // RSS and Atom were added in IE 7 so they won't be in $sampleFound for
00447         // previous versions
00448         if ( isset( $sampleFound['rss'] ) ) {
00449             return 'application/rss+xml';
00450         }
00451         if ( isset( $sampleFound['rdf-tag'] )
00452             && isset( $sampleFound['rdf-url'] )
00453             && isset( $sampleFound['rdf-purl'] ) )
00454         {
00455             return 'application/rss+xml';
00456         }
00457         if ( isset( $sampleFound['atom'] ) ) {
00458             return 'application/atom+xml';
00459         }
00460 
00461         if ( isset( $sampleFound['xml'] ) ) {
00462             // TODO: I'm not sure under what circumstances this flag is enabled
00463             if ( strpos( $version, 'strict' ) !== false ) {
00464                 if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
00465                     return 'text/xml';
00466                 }
00467             } else {
00468                 return 'text/xml';
00469             }
00470         }
00471         if ( isset( $sampleFound['html'] ) ) {
00472             // TODO: I'm not sure under what circumstances this flag is enabled
00473             if ( strpos( $version, 'nohtml' ) !== false ) {
00474                 if ( $proposed == 'text/plain' ) {
00475                     return 'text/html';
00476                 }
00477             } else {
00478                 return 'text/html';
00479             }
00480         }
00481         if ( isset( $sampleFound['xbm'] ) ) {
00482             return 'image/x-bitmap';
00483         }
00484         if ( isset( $sampleFound['binhex'] ) ) {
00485             return 'application/macbinhex40';
00486         }
00487         if ( isset( $sampleFound['scriptlet'] ) ) {
00488             if ( strpos( $version, 'strict' ) !== false ) {
00489                 if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
00490                     return 'text/scriptlet';
00491                 }
00492             } else {
00493                 return 'text/scriptlet';
00494             }
00495         }
00496 
00497         // Freaky heuristics to determine if the data is text or binary
00498         // The heuristic is of course broken for non-ASCII text
00499         if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
00500             < ( $counters['ctrl'] + $counters['high'] ) * 16 )
00501         {
00502             $kindOfBinary = true;
00503             $type = $binaryType ? $binaryType : $textType;
00504             if ( $type === false ) {
00505                 $type = 'application/octet-stream';
00506             }
00507         } else {
00508             $kindOfBinary = false;
00509             $type = $textType ? $textType : $binaryType;
00510             if ( $type === false ) {
00511                 $type = 'text/plain';
00512             }
00513         }
00514 
00515         // Check if the output format is ambiguous
00516         // This generally means that detection failed, real types aren't ambiguous
00517         $detectedFormat = $this->getDataFormat( $version, $type );
00518         if ( $detectedFormat != 'ambiguous' ) {
00519             return $type;
00520         }
00521 
00522         if ( $proposedFormat != 'ambiguous' ) {
00523             // FormatAgreesWithData()
00524             if ( $proposedFormat == 'text' && !$kindOfBinary ) {
00525                 return $proposed;
00526             }
00527             if ( $proposedFormat == 'binary' && $kindOfBinary ) {
00528                 return $proposed;
00529             }
00530             if ( $proposedFormat == 'html' ) {
00531                 return $proposed;
00532             }
00533         }
00534 
00535         // Find a MIME type by searching the registry for the file extension.
00536         $dotPos = strrpos( $fileName, '.' );
00537         if ( $dotPos === false ) {
00538             return $type;
00539         }
00540         $ext = substr( $fileName, $dotPos );
00541         if ( isset( $this->registry[$ext] ) ) {
00542             return $this->registry[$ext];
00543         }
00544 
00545         // TODO: If the extension has an application registered to it, IE will return
00546         // application/octet-stream. We'll skip that, so we could erroneously
00547         // return text/plain or application/x-netcdf where application/octet-stream
00548         // would be correct.
00549 
00550         return $type;
00551     }
00552 
00560     private function checkTextHeaders( $version, $chunk ) {
00561         $chunk2 = substr( $chunk, 0, 2 );
00562         $chunk4 = substr( $chunk, 0, 4 );
00563         $chunk5 = substr( $chunk, 0, 5 );
00564         if ( $chunk4 == '%PDF' ) {
00565             return 'application/pdf';
00566         }
00567         if ( $chunk2 == '%!' ) {
00568             return 'application/postscript';
00569         }
00570         if ( $chunk5 == '{\\rtf' ) {
00571             return 'text/richtext';
00572         }
00573         if ( $chunk5 == 'begin' ) {
00574             return 'application/base64';
00575         }
00576         return false;
00577     }
00578 
00586     private function checkBinaryHeaders( $version, $chunk ) {
00587         $chunk2 = substr( $chunk, 0, 2 );
00588         $chunk3 = substr( $chunk, 0, 3 );
00589         $chunk4 = substr( $chunk, 0, 4 );
00590         $chunk5 = substr( $chunk, 0, 5 );
00591         $chunk5uc = strtoupper( $chunk5 );
00592         $chunk8 = substr( $chunk, 0, 8 );
00593         if ( $chunk5uc == 'GIF87' || $chunk5uc == 'GIF89' ) {
00594             return 'image/gif';
00595         }
00596         if ( $chunk2 == "\xff\xd8" ) {
00597             return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
00598         }
00599 
00600         if ( $chunk2 == 'BM'
00601             && substr( $chunk, 6, 2 ) == "\000\000"
00602             && substr( $chunk, 8, 2 ) == "\000\000" )
00603         {
00604             return 'image/bmp'; // another non-standard MIME
00605         }
00606         if ( $chunk4 == 'RIFF'
00607             && substr( $chunk, 8, 4 ) == 'WAVE' )
00608         {
00609             return 'audio/wav';
00610         }
00611         // These were integer literals in IE
00612         // Perhaps the author was not sure what the target endianness was
00613         if ( $chunk4 == ".sd\000"
00614             || $chunk4 == ".snd"
00615             || $chunk4 == "\000ds."
00616             || $chunk4 == "dns." )
00617         {
00618             return 'audio/basic';
00619         }
00620         if ( $chunk3 == "MM\000" ) {
00621             return 'image/tiff';
00622         }
00623         if ( $chunk2 == 'MZ' ) {
00624             return 'application/x-msdownload';
00625         }
00626         if ( $chunk8 == "\x89PNG\x0d\x0a\x1a\x0a" ) {
00627             return 'image/x-png'; // [sic]
00628         }
00629         if ( strlen( $chunk ) >= 5 ) {
00630             $byte2 = ord( $chunk[2] );
00631             $byte4 = ord( $chunk[4] );
00632             if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 == 0 && $chunk2 == 'JG' ) {
00633                 return 'image/x-jg';
00634             }
00635         }
00636         // More endian confusion?
00637         if ( $chunk4 == 'MROF' ) {
00638             return 'audio/x-aiff';
00639         }
00640         $chunk4_8 = substr( $chunk, 8, 4 );
00641         if ( $chunk4 == 'FORM' && ( $chunk4_8 == 'AIFF' || $chunk4_8 == 'AIFC' ) ) {
00642             return 'audio/x-aiff';
00643         }
00644         if ( $chunk4 == 'RIFF' && $chunk4_8 == 'AVI ' ) {
00645             return 'video/avi';
00646         }
00647         if ( $chunk4 == "\x00\x00\x01\xb3" || $chunk4 == "\x00\x00\x01\xba" ) {
00648             return 'video/mpeg';
00649         }
00650         if ( $chunk4 == "\001\000\000\000"
00651             && substr( $chunk, 40, 4 ) == ' EMF' )
00652         {
00653             return 'image/x-emf';
00654         }
00655         if ( $chunk4 == "\xd7\xcd\xc6\x9a" ) {
00656             return 'image/x-wmf';
00657         }
00658         if ( $chunk4 == "\xca\xfe\xba\xbe" ) {
00659             return 'application/java';
00660         }
00661         if ( $chunk2 == 'PK' ) {
00662             return 'application/x-zip-compressed';
00663         }
00664         if ( $chunk2 == "\x1f\x9d" ) {
00665             return 'application/x-compressed';
00666         }
00667         if ( $chunk2 == "\x1f\x8b" ) {
00668             return 'application/x-gzip-compressed';
00669         }
00670         // Skip redundant check for ZIP
00671         if ( $chunk5 == "MThd\000" ) {
00672             return 'audio/mid';
00673         }
00674         if ( $chunk4 == '%PDF' ) {
00675             return 'application/pdf';
00676         }
00677         return false;
00678     }
00679 
00687     protected function sampleData( $version, $chunk ) {
00688         $found = array();
00689         $counters = array(
00690             'ctrl' => 0,
00691             'high' => 0,
00692             'low' => 0,
00693             'lf' => 0,
00694             'cr' => 0,
00695             'ff' => 0
00696         );
00697         $htmlTags = array(
00698             'html',
00699             'head',
00700             'title',
00701             'body',
00702             'script',
00703             'a href',
00704             'pre',
00705             'img',
00706             'plaintext',
00707             'table'
00708         );
00709         $rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
00710         $rdfPurl = 'http://purl.org/rss/1.0/';
00711         $xbmMagic1 = '#define';
00712         $xbmMagic2 = '_width';
00713         $xbmMagic3 = '_bits';
00714         $binhexMagic = 'converted with BinHex';
00715 
00716         for ( $offset = 0; $offset < strlen( $chunk ); $offset++ ) {
00717             $curChar = $chunk[$offset];
00718             if ( $curChar == "\x0a" ) {
00719                 $counters['lf']++;
00720                 continue;
00721             } elseif ( $curChar == "\x0d" ) {
00722                 $counters['cr']++;
00723                 continue;
00724             } elseif ( $curChar == "\x0c" ) {
00725                 $counters['ff']++;
00726                 continue;
00727             } elseif ( $curChar == "\t" ) {
00728                 $counters['low']++;
00729                 continue;
00730             } elseif ( ord( $curChar ) < 32 ) {
00731                 $counters['ctrl']++;
00732                 continue;
00733             } elseif ( ord( $curChar ) >= 128 ) {
00734                 $counters['high']++;
00735                 continue;
00736             }
00737 
00738             $counters['low']++;
00739             if ( $curChar == '<' ) {
00740                 // XML
00741                 $remainder = substr( $chunk, $offset + 1 );
00742                 if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
00743                     $nextChar = substr( $chunk, $offset + 5, 1 );
00744                     if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
00745                         $found['xml'] = true;
00746                     }
00747                 }
00748                 // Scriptlet (JSP)
00749                 if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
00750                     $found['scriptlet'] = true;
00751                     break;
00752                 }
00753                 // HTML
00754                 foreach ( $htmlTags as $tag ) {
00755                     if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
00756                         $found['html'] = true;
00757                     }
00758                 }
00759                 // Skip broken check for additional tags (HR etc.)
00760 
00761                 // CHANNEL replaced by RSS, RDF and FEED in IE 7
00762                 if ( $version < 'ie07' ) {
00763                     if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
00764                         $found['cdf'] = true;
00765                     }
00766                 } else {
00767                     // RSS
00768                     if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
00769                         $found['rss'] = true;
00770                         break; // return from SampleData
00771                     }
00772                     if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
00773                         $found['rdf-tag'] = true;
00774                         // no break
00775                     }
00776                     if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
00777                         $found['atom'] = true;
00778                         break;
00779                     }
00780                 }
00781                 continue;
00782             }
00783             // Skip broken check for -->
00784 
00785             // RSS URL checks
00786             // For some reason both URLs must appear before it is recognised
00787             $remainder = substr( $chunk, $offset );
00788             if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
00789                 $found['rdf-url'] = true;
00790                 if ( isset( $found['rdf-tag'] )
00791                     && isset( $found['rdf-purl'] ) ) // [sic]
00792                 {
00793                     break;
00794                 }
00795                 continue;
00796             }
00797 
00798             if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
00799                 if ( isset( $found['rdf-tag'] )
00800                     && isset( $found['rdf-url'] ) ) // [sic]
00801                 {
00802                     break;
00803                 }
00804                 continue;
00805             }
00806 
00807             // XBM checks
00808             if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
00809                 $found['xbm1'] = true;
00810                 continue;
00811             }
00812             if ( $curChar == '_' ) {
00813                 if ( isset( $found['xbm2'] ) ) {
00814                     if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
00815                         $found['xbm'] = true;
00816                         break;
00817                     }
00818                 } elseif ( isset( $found['xbm1'] ) ) {
00819                     if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
00820                         $found['xbm2'] = true;
00821                     }
00822                 }
00823             }
00824 
00825             // BinHex
00826             if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
00827                 $found['binhex'] = true;
00828             }
00829         }
00830         return array( 'found' => $found, 'counters' => $counters );
00831     }
00832 
00838     protected function getDataFormat( $version, $type ) {
00839         $types = $this->typeTable[$version];
00840         if ( $type == '(null)' || strval( $type ) === '' ) {
00841             return 'ambiguous';
00842         }
00843         foreach ( $types as $format => $list ) {
00844             if ( in_array( $type, $list ) ) {
00845                 return $format;
00846             }
00847         }
00848         return 'unknown';
00849     }
00850 }