MediaWiki  REL1_22
JpegMetadataExtractor.php
Go to the documentation of this file.
00001 <?php
/**
 * Low-level reader for the metadata-bearing parts of a JPEG file.
 *
 * segmentSplitter() walks the marker segments at the start of a file and
 * collects the comment, XMP, Exif byte order and Photoshop (APP13) data.
 * doPSIR() inspects one APP13 segment and reports whether the IPTC block
 * it contains matches the hash recorded next to it.
 */
class JpegMetadataExtractor {

    /**
     * Upper bound on the number of segments segmentSplitter() will walk.
     *
     * The max segment is a sanity check. A jpeg file should never even
     * remotely have that many segments. Your average file has about 10.
     */
    const MAX_JPEG_SEGMENTS = 200;

    /**
     * Split the leading segments of a JPEG file into the pieces that carry
     * metadata.
     *
     * Reading stops at the first EOI (0xD9) or SOS (0xDA) marker. The file
     * handle is closed on every exit path, including when an exception is
     * thrown part-way through.
     *
     * @param string $filename Path of the file to read.
     * @return array Always has the keys 'XMP_ext', 'COM' and 'PSIR' (each a
     *  list of strings). 'XMP' (string) and 'byteOrder' ('BE' or 'LE') are
     *  only present when a matching APP1 segment was found. XMP data is
     *  only collected when xml_parser_create_ns() is available.
     * @throws MWException If the file is missing, unreadable, is not a JPEG,
     *  or is malformed.
     */
    public static function segmentSplitter( $filename ) {
        $showXMP = function_exists( 'xml_parser_create_ns' );

        $segmentCount = 0;

        $segments = array(
            'XMP_ext' => array(),
            'COM' => array(),
            'PSIR' => array(),
        );

        if ( !$filename ) {
            throw new MWException( "No filename specified for " . __METHOD__ );
        }
        if ( !file_exists( $filename ) || is_dir( $filename ) ) {
            throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
        }

        $fh = fopen( $filename, "rb" );

        if ( !$fh ) {
            throw new MWException( "Could not open file $filename" );
        }

        // try/catch + rethrow rather than "finally", so that the handle is
        // released on failure without requiring PHP 5.5.
        try {
            $buffer = fread( $fh, 2 );
            if ( $buffer !== "\xFF\xD8" ) {
                throw new MWException( "Not a jpeg, no SOI" );
            }
            while ( !feof( $fh ) ) {
                $buffer = fread( $fh, 1 );
                $segmentCount++;
                if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
                    // this is just a sanity check
                    throw new MWException( 'Too many jpeg segments. Aborting' );
                }
                if ( $buffer !== "\xFF" ) {
                    throw new MWException( "Error reading jpeg file marker. Expected 0xFF but got " . bin2hex( $buffer ) );
                }

                $buffer = fread( $fh, 1 );
                while ( $buffer === "\xFF" && !feof( $fh ) ) {
                    // Skip through any 0xFF padding bytes.
                    $buffer = fread( $fh, 1 );
                }
                if ( $buffer === "\xFE" ) {

                    // COM section -- file comment
                    // First see if valid utf-8,
                    // if not try to convert it to windows-1252.
                    $com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
                    UtfNormal::quickIsNFCVerify( $com );
                    // turns $com to valid utf-8.
                    // thus if no change, its utf-8, otherwise its something else.
                    if ( $com !== $oldCom ) {
                        wfSuppressWarnings();
                        $com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
                        wfRestoreWarnings();
                    }
                    // Try it again, if its still not a valid string, then probably
                    // binary junk or some really weird encoding, so don't extract.
                    UtfNormal::quickIsNFCVerify( $com );
                    if ( $com === $oldCom ) {
                        $segments["COM"][] = $oldCom;
                    } else {
                        wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
                    }

                } elseif ( $buffer === "\xE1" ) {
                    // APP1 section (Exif, XMP, and XMP extended)
                    // only extract if XMP is enabled.
                    $temp = self::jpegExtractMarker( $fh );
                    // check what type of app segment this is.
                    if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                        $segments["XMP"] = substr( $temp, 29 );
                    } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
                        $segments["XMP_ext"][] = substr( $temp, 35 );
                    } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                        // Some images (especially flickr images) seem to have this.
                        // I really have no idea what the deal is with them, but
                        // whatever...
                        $segments["XMP"] = substr( $temp, 29 );
                        wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
                            . "Using anyways.\n" );
                    } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
                        // Just need to find out what the byte order is.
                        // This is a II for little Endian, MM for big. Not a unicode BOM.
                        $byteOrderMarker = substr( $temp, 6, 2 );
                        if ( $byteOrderMarker === 'MM' ) {
                            $segments['byteOrder'] = 'BE';
                        } elseif ( $byteOrderMarker === 'II' ) {
                            $segments['byteOrder'] = 'LE';
                        } else {
                            wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
                        }
                    }
                } elseif ( $buffer === "\xED" ) {
                    // APP13 - PSIR. IPTC and some photoshop stuff
                    $temp = self::jpegExtractMarker( $fh );
                    if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
                        $segments["PSIR"][] = $temp;
                    }
                } elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
                    // EOI - end of image or SOS - start of scan. either way
                    // we're past any interesting segments
                    fclose( $fh );
                    return $segments;
                } else {
                    // segment we don't care about, so skip its payload.
                    fseek( $fh, self::readPayloadLength( $fh ), SEEK_CUR );
                }
            }
            // shouldn't get here.
            throw new MWException( "Reached end of jpeg file unexpectedly" );
        } catch ( Exception $e ) {
            fclose( $fh );
            throw $e;
        }
    }

    /**
     * Read the two-byte big-endian length field of a segment and return the
     * number of payload bytes that follow it.
     *
     * The length field counts its own two bytes, so a value of 2 denotes an
     * empty segment; only values below 2 are malformed.
     *
     * @param resource $fh File handle positioned just after a marker byte.
     * @return int Payload length in bytes (0 or more).
     * @throws MWException If the recorded length is smaller than 2.
     */
    private static function readPayloadLength( $fh ) {
        $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
        if ( $size['int'] < 2 ) {
            throw new MWException( "invalid marker size in jpeg" );
        }
        return $size['int'] - 2;
    }

    /**
     * Read the payload of the segment whose length field starts at the
     * current position of the file handle.
     *
     * @param resource &$fh File handle positioned just after a marker byte.
     * @return string The segment payload, without marker or length field.
     *  Empty string for a segment that has no payload.
     * @throws MWException If the length field is invalid or the file ends
     *  before the whole payload could be read.
     */
    private static function jpegExtractMarker( &$fh ) {
        $length = self::readPayloadLength( $fh );
        if ( $length === 0 ) {
            // fread() does not accept a zero length, so answer directly.
            return '';
        }
        $segment = fread( $fh, $length );
        if ( strlen( $segment ) !== $length ) {
            throw new MWException( "Segment shorter than expected" );
        }
        return $segment;
    }

    /**
     * Check the IPTC block of an APP13 (Photoshop image resource) segment
     * against the hash stored alongside it.
     *
     * Resource 0x0404 contains the IPTC data and 0x0425 its recorded hash.
     * This is used to determine if the iptc is newer than the xmp data, as
     * xmp programs update the hash, where non-xmp programs don't.
     *
     * @param string $app13 Segment payload, starting with the
     *  "Photoshop 3.0\0" identifier (not re-checked here).
     * @return string 'iptc-no-hash' if either the IPTC block or the recorded
     *  hash is missing, 'iptc-good-hash' if the raw md5 of the IPTC block
     *  equals the recorded hash, 'iptc-bad-hash' otherwise.
     * @throws MWException If no segment is given or a resource block claims
     *  more data than the segment holds.
     */
    public static function doPSIR( $app13 ) {
        if ( !$app13 ) {
            throw new MWException( "No App13 segment given" );
        }

        $offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
        $appLen = strlen( $app13 );
        $realHash = "";
        $recordedHash = "";

        // the +12 is the length of an empty item.
        while ( $offset + 12 <= $appLen ) {
            // The signature is supposed to be 8BIM but apparently sometimes
            // isn't, esp. in really old jpg's. Such blocks are stepped over
            // without being interpreted.
            $valid = ( substr( $app13, $offset, 4 ) === '8BIM' );
            $offset += 4;

            // id is a 2 byte id number which identifies
            // the piece of info this record contains.
            $id = substr( $app13, $offset, 2 );
            $offset += 2;

            // some record types can contain a name, which is a pascal string
            // 0-padded to be an even number of bytes. Most times (and any
            // time we care) this is empty, making it two null bytes.
            // We never use the name so skip it. +1 for the length byte.
            $lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
            if ( $lenName % 2 == 1 ) {
                $lenName++; // pad to even.
            }
            $offset += $lenName;

            // now length of data (unsigned long big endian)
            $lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
            // PHP can take issue with very large unsigned ints and make them negative.
            // Which should never ever happen, as this has to be inside a segment
            // which is limited to a 16 bit number.
            if ( $lenData['len'] < 0 ) {
                throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );
            }

            $offset += 4; // 4bytes length field;

            // this should not happen, but check.
            if ( $lenData['len'] + $offset > $appLen ) {
                throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
                    . "; offset=$offset; total length=$appLen)" );
            }

            if ( $valid ) {
                if ( $id === "\x04\x04" ) {
                    // IPTC block
                    $realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
                } elseif ( $id === "\x04\x25" ) {
                    // hash of the IPTC block as recorded by the writer
                    $recordedHash = substr( $app13, $offset, $lenData['len'] );
                }
            }

            // if odd, add 1 to length to account for
            // null pad byte.
            if ( $lenData['len'] % 2 == 1 ) {
                $lenData['len']++;
            }
            $offset += $lenData['len'];
        }

        if ( !$realHash || !$recordedHash ) {
            return 'iptc-no-hash';
        } elseif ( $realHash === $recordedHash ) {
            return 'iptc-good-hash';
        } else { /*$realHash !== $recordedHash */
            return 'iptc-bad-hash';
        }
    }
}