MediaWiki  REL1_19
JpegMetadataExtractor.php
Go to the documentation of this file.
00001 <?php
/**
 * Helpers for pulling metadata-bearing segments out of a JPEG file.
 *
 * segmentSplitter() walks the marker segments at the start of a file and
 * collects the raw comment (COM), APP1 (XMP / extended XMP / Exif byte
 * order) and APP13 (Photoshop image resource) payloads. doPSIR() then
 * inspects one APP13 payload to compare the IPTC block against the hash
 * recorded next to it.
 */
class JpegMetadataExtractor {

	/**
	 * Sanity limit on the number of segments segmentSplitter() will walk
	 * before giving up. A jpeg file should never even remotely have that
	 * many segments. Your average file has about 10.
	 */
	const MAX_JPEG_SEGMENTS = 200;

	/**
	 * Split the leading segments of a JPEG file into the pieces we care about.
	 *
	 * Reading stops at the first EOI (0xD9) or SOS (0xDA) marker. The file
	 * handle is closed before this method returns or throws.
	 *
	 * @param $filename String: path of the JPEG file to read.
	 * @return Array with these keys:
	 *   - 'XMP_ext'   => array of extended XMP payloads (identifier stripped)
	 *   - 'COM'       => array of comment strings (UTF-8)
	 *   - 'PSIR'      => array of APP13 payloads starting "Photoshop 3.0\0"
	 *   - 'XMP'       => (only if found) main XMP payload (identifier stripped)
	 *   - 'byteOrder' => (only if found) 'BE' or 'LE', from the Exif header
	 * @throws MWException if no filename is given, the file is missing, is a
	 *   directory or cannot be opened, does not start with SOI, has more than
	 *   MAX_JPEG_SEGMENTS segments, contains a malformed marker or size, or
	 *   ends before an EOI/SOS marker is seen.
	 */
	static function segmentSplitter ( $filename ) {
		// XMP is only collected when PHP's namespace-aware XML parser exists.
		$showXMP = function_exists( 'xml_parser_create_ns' );

		$segmentCount = 0;

		$segments = array(
			'XMP_ext' => array(),
			'COM' => array(),
			'PSIR' => array(),
		);

		if ( !$filename ) {
			throw new MWException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );

		if ( !$fh ) {
			throw new MWException( "Could not open file $filename" );
		}

		// Everything that touches the handle runs inside this try block so
		// the handle can be released on failure as well as on success.
		// (catch + rethrow is used rather than "finally" to stay compatible
		// with the PHP versions this file targets.)
		try {
			$buffer = fread( $fh, 2 );
			if ( $buffer !== "\xFF\xD8" ) {
				throw new MWException( "Not a jpeg, no SOI" );
			}
			while ( !feof( $fh ) ) {
				$buffer = fread( $fh, 1 );
				$segmentCount++;
				if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
					// this is just a sanity check
					throw new MWException( 'Too many jpeg segments. Aborting' );
				}
				if ( $buffer !== "\xFF" ) {
					throw new MWException( "Error reading jpeg file marker. Expected 0xFF but got " . bin2hex( $buffer ) );
				}

				$buffer = fread( $fh, 1 );
				while( $buffer === "\xFF" && !feof( $fh ) ) {
					// Skip through any 0xFF padding bytes.
					$buffer = fread( $fh, 1 );
				}
				if ( $buffer === "\xFE" ) {

					// COM section -- file comment
					// First see if valid utf-8,
					// if not try to convert it to windows-1252.
					$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
					UtfNormal::quickIsNFCVerify( $com );
					// turns $com to valid utf-8.
					// thus if no change, its utf-8, otherwise its something else.
					if ( $com !== $oldCom ) {
						wfSuppressWarnings();
						$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
						wfRestoreWarnings();
					}
					// Try it again, if its still not a valid string, then probably
					// binary junk or some really weird encoding, so don't extract.
					UtfNormal::quickIsNFCVerify( $com );
					if ( $com === $oldCom ) {
						$segments["COM"][] = $oldCom;
					} else {
						wfDebug( __METHOD__ . ' Ignoring JPEG comment as is garbage.' );
					}

				} elseif ( $buffer === "\xE1" ) {
					// APP1 section (Exif, XMP, and XMP extended)
					// only extract if XMP is enabled.
					$temp = self::jpegExtractMarker( $fh );
					// check what type of app segment this is.
					if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						$segments["XMP"] = substr( $temp, 29 );
					} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
						$segments["XMP_ext"][] = substr( $temp, 35 );
					} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						// Some images (especially flickr images) seem to have this.
						// I really have no idea what the deal is with them, but
						// whatever...
						$segments["XMP"] = substr( $temp, 29 );
						wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
							. "Using anyways.\n" );
					} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
						// Just need to find out what the byte order is.
						// This is a II for little Endian, MM for big. Not a unicode BOM.
						$byteOrderMarker = substr( $temp, 6, 2 );
						if ( $byteOrderMarker === 'MM' ) {
							$segments['byteOrder'] = 'BE';
						} elseif ( $byteOrderMarker === 'II' ) {
							$segments['byteOrder'] = 'LE';
						} else {
							wfDebug( __METHOD__ . ' Invalid byte ordering?!' );
						}
					}
				} elseif ( $buffer === "\xED" ) {
					// APP13 - PSIR. IPTC and some photoshop stuff
					$temp = self::jpegExtractMarker( $fh );
					if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
						$segments["PSIR"][] = $temp;
					}
				} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
					// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
					fclose( $fh );
					return $segments;
				} else {
					// segment we don't care about, so skip
					$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
					// The size field counts its own two bytes, so 2 means an
					// empty (but valid) segment; only smaller values are bogus.
					if ( $size['int'] < 2 ) {
						throw new MWException( "invalid marker size in jpeg" );
					}
					fseek( $fh, $size['int'] - 2, SEEK_CUR );
				}

			}
			// shouldn't get here.
			throw new MWException( "Reached end of jpeg file unexpectedly" );
		} catch ( Exception $e ) {
			// Release the handle, then let the original exception propagate.
			fclose( $fh );
			throw $e;
		}
	}

	/**
	 * Read one length-prefixed segment payload from the current position.
	 *
	 * Expects the handle to be positioned on the 2-byte big-endian size
	 * field that follows a marker. The size includes the size field itself.
	 *
	 * @param $fh Resource: open file handle, advanced past the segment.
	 * @return String: the payload without the size field; '' for an empty
	 *   segment (declared size of exactly 2).
	 * @throws MWException if the declared size is smaller than 2 or the file
	 *   ends before the whole payload could be read. (wfUnpack is defined
	 *   elsewhere; presumably it also throws when fewer than 2 bytes remain.)
	 */
	private static function jpegExtractMarker( &$fh ) {
		$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
		if ( $size['int'] < 2 ) {
			throw new MWException( "invalid marker size in jpeg" );
		}
		$payloadLength = $size['int'] - 2;
		if ( $payloadLength === 0 ) {
			// Empty segment. Don't call fread() with a zero length:
			// it is rejected by PHP instead of returning ''.
			return '';
		}
		$segment = fread( $fh, $payloadLength );
		if ( strlen( $segment ) !== $payloadLength ) {
			throw new MWException( "Segment shorter than expected" );
		}
		return $segment;
	}

	/**
	 * Check the IPTC hash stored in an APP13 (Photoshop image resource) payload.
	 *
	 * Walks the '8BIM' resource records that follow the 14-byte
	 * "Photoshop 3.0\0" identifier. Resource 0x0404 holds the IPTC block and
	 * resource 0x0425 holds a recorded hash; the md5 of the former is compared
	 * with the latter. This is used to determine if the iptc is newer than
	 * the xmp data, as xmp programs update the hash, where non-xmp programs
	 * don't.
	 *
	 * @param $app13 String: APP13 payload, including the identifier (which is
	 *   skipped, not re-checked, here).
	 * @return String: 'iptc-no-hash' if either the IPTC block or the recorded
	 *   hash is missing/empty, 'iptc-good-hash' if they match, otherwise
	 *   'iptc-bad-hash'.
	 * @throws MWException if $app13 is empty, or a record declares a negative
	 *   length or a length that runs past the end of the payload.
	 */
	public static function doPSIR ( $app13 ) {
		if ( !$app13 ) {
			throw new MWException( "No App13 segment given" );
		}
		// First compare hash with real thing
		// 0x404 contains IPTC, 0x425 has hash

		$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item.
		// (4 signature + 2 id + 2 empty padded name + 4 data length)
		while ( $offset + 12 <= $appLen ) {
			$valid = true;
			if ( substr( $app13, $offset, 4 ) !== '8BIM' ) {
				// its supposed to be 8BIM
				// but apparently sometimes isn't esp. in
				// really old jpg's
				// Such records are still stepped over, just never inspected.
				$valid = false;
			}
			$offset += 4;
			$id = substr( $app13, $offset, 2 );
			// id is a 2 byte id number which identifies
			// the piece of info this record contains.

			$offset += 2;

			// some record types can contain a name, which
			// is a pascal string 0-padded to be an even
			// number of bytes. Most times (and any time
			// we care) this is empty, making it two null bytes.

			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			// we never use the name so skip it. +1 for length byte
			if ( $lenName % 2 == 1 ) {
				$lenName++;
			} // pad to even.
			$offset += $lenName;

			// now length of data (unsigned long big endian)
			$lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
			// PHP can take issue with very large unsigned ints and make them negative.
			// Which should never ever happen, as this has to be inside a segment
			// which is limited to a 16 bit number.
			if ( $lenData['len'] < 0 ) throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );

			$offset += 4; // 4bytes length field;

			// this should not happen, but check.
			if ( $lenData['len'] + $offset > $appLen ) {
				throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
					. "; offset=$offset; total length=$appLen)" );
			}

			if ( $valid ) {
				switch ( $id ) {
					case "\x04\x04":
						// IPTC block
						$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
						break;
					case "\x04\x25":
						// Recorded hash, compared against the raw md5 above.
						$recordedHash = substr( $app13, $offset, $lenData['len'] );
						break;
				}
			}

			// if odd, add 1 to length to account for
			// null pad byte.
			if ( $lenData['len'] % 2 == 1 ) $lenData['len']++;
			$offset += $lenData['len'];

		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		} elseif ( $realHash === $recordedHash ) {
			return 'iptc-good-hash';
		} else { /*$realHash !== $recordedHash */
			return 'iptc-bad-hash';
		}
	}
}