MediaWiki
REL1_19
|
<?php
/**
 * Splits a JPEG file into the segments that carry metadata and inspects
 * Photoshop image resource (APP13) blocks.
 *
 * Only the segments located before the image data are examined: file
 * comments (COM), APP1 (Exif byte order, XMP and extended XMP) and
 * APP13 (Photoshop image resources, which is where IPTC lives).
 */
class JpegMetadataExtractor {

	/**
	 * Sanity limit on the number of segments read from a single file.
	 * A jpeg file should never even remotely have that many segments;
	 * an average file has about 10.
	 */
	const MAX_JPEG_SEGMENTS = 200;

	/**
	 * Read the metadata-bearing segments of a JPEG file.
	 *
	 * Reading stops at the first SOS (start of scan) or EOI (end of image)
	 * marker, since nothing of interest follows them. The file handle is
	 * closed before this method returns or throws.
	 *
	 * @param $filename String: path of the JPEG file to read.
	 * @return Array with the keys:
	 *   - 'XMP_ext': list of extended XMP payloads (identifier stripped)
	 *   - 'COM': list of file comments that could be read as UTF-8
	 *   - 'PSIR': list of raw APP13 payloads starting with "Photoshop 3.0\0"
	 *   - 'XMP': (only if found) main XMP payload (identifier stripped)
	 *   - 'byteOrder': (only if found) 'BE' or 'LE', taken from the Exif header
	 *   XMP entries are only collected when xml_parser_create_ns() exists.
	 * @throws MWException if the file is missing, unreadable, not a JPEG,
	 *   malformed, or ends before an SOS/EOI marker.
	 */
	public static function segmentSplitter( $filename ) {
		$showXMP = function_exists( 'xml_parser_create_ns' );

		$segments = array(
			'XMP_ext' => array(),
			'COM' => array(),
			'PSIR' => array(),
		);

		if ( !$filename ) {
			throw new MWException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );

		if ( !$fh ) {
			throw new MWException( "Could not open file $filename" );
		}

		$finished = false;
		$segmentCount = 0;

		// Everything that touches the handle lives inside this try block so
		// that the handle is closed on every exit path (catch + rethrow is
		// used instead of "finally" to stay compatible with old PHP versions).
		try {
			$buffer = fread( $fh, 2 );
			if ( $buffer !== "\xFF\xD8" ) {
				throw new MWException( "Not a jpeg, no SOI" );
			}

			while ( !feof( $fh ) ) {
				$buffer = fread( $fh, 1 );
				if ( $buffer === false || $buffer === '' ) {
					// feof() only becomes true after a read hits the end,
					// so a file ending on a segment boundary lands here.
					// Leave the loop and report the truncation below.
					break;
				}

				$segmentCount++;
				if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
					// this is just a sanity check
					throw new MWException( 'Too many jpeg segments. Aborting' );
				}
				if ( $buffer !== "\xFF" ) {
					throw new MWException( "Error reading jpeg file marker. Expected 0xFF but got " . bin2hex( $buffer ) );
				}

				$buffer = fread( $fh, 1 );
				while ( $buffer === "\xFF" && !feof( $fh ) ) {
					// Skip through any 0xFF padding bytes.
					$buffer = fread( $fh, 1 );
				}

				if ( $buffer === "\xFE" ) {
					// COM section -- file comment
					$comment = self::decodeComment( self::jpegExtractMarker( $fh ) );
					if ( $comment !== null ) {
						$segments["COM"][] = $comment;
					} else {
						wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
					}
				} elseif ( $buffer === "\xE1" ) {
					// APP1 section (Exif, XMP, and XMP extended)
					// XMP is only extracted if XMP support is available.
					$temp = self::jpegExtractMarker( $fh );
					// check what type of app segment this is.
					if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						$segments["XMP"] = substr( $temp, 29 );
					} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
						$segments["XMP_ext"][] = substr( $temp, 35 );
					} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						// Some images (especially flickr images) seem to have
						// this malformed identifier. Accept it anyway.
						$segments["XMP"] = substr( $temp, 29 );
						wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
							. "Using anyways.\n" );
					} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
						// Only the byte order is needed from the Exif header.
						// This is II for little endian, MM for big endian.
						// It is not a unicode BOM.
						$byteOrderMarker = substr( $temp, 6, 2 );
						if ( $byteOrderMarker === 'MM' ) {
							$segments['byteOrder'] = 'BE';
						} elseif ( $byteOrderMarker === 'II' ) {
							$segments['byteOrder'] = 'LE';
						} else {
							wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
						}
					}
				} elseif ( $buffer === "\xED" ) {
					// APP13 - PSIR. IPTC and some photoshop stuff
					$temp = self::jpegExtractMarker( $fh );
					if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
						$segments["PSIR"][] = $temp;
					}
				} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
					// EOI - end of image or SOS - start of scan.
					// Either way we're past any interesting segments.
					$finished = true;
					break;
				} else {
					// segment we don't care about, so skip its payload
					$payloadLength = self::readPayloadLength( $fh );
					if ( $payloadLength > 0 ) {
						fseek( $fh, $payloadLength, SEEK_CUR );
					}
				}
			}
		} catch ( Exception $e ) {
			fclose( $fh );
			throw $e;
		}

		fclose( $fh );

		if ( $finished ) {
			return $segments;
		}

		// Ran out of file without seeing an SOS or EOI marker.
		throw new MWException( "Reached end of jpeg file unexpectedly" );
	}

	/**
	 * Turn the raw payload of a COM segment into a UTF-8 comment.
	 *
	 * The trimmed text is first checked as UTF-8. If that check alters it,
	 * it is converted from windows-1252 and checked again. If it still does
	 * not survive the check unchanged it is considered binary junk (or some
	 * really weird encoding) and rejected.
	 *
	 * @param $raw String: payload of the COM segment.
	 * @return String|null the comment, or null if it could not be decoded.
	 */
	private static function decodeComment( $raw ) {
		$original = trim( $raw );
		$checked = $original;
		// quickIsNFCVerify() takes its argument by reference and cleans it
		// up, so "unchanged afterwards" means "was already acceptable".
		UtfNormal::quickIsNFCVerify( $checked );
		if ( $checked !== $original ) {
			wfSuppressWarnings();
			$converted = iconv( 'windows-1252', 'UTF-8//IGNORE', $original );
			wfRestoreWarnings();
			if ( $converted === false ) {
				// Conversion failed outright; never store false as a comment.
				return null;
			}
			$original = $converted;
			$checked = $converted;
		}
		// Try it again on the (possibly converted) text.
		UtfNormal::quickIsNFCVerify( $checked );

		return ( $checked === $original ) ? $original : null;
	}

	/**
	 * Read the two byte big-endian length field of a segment and return
	 * the number of payload bytes that follow it.
	 *
	 * The stored length counts the length field itself, so 2 is the
	 * smallest legal value and denotes an empty payload.
	 *
	 * @param $fh Resource: file handle positioned at the length field.
	 * @return Integer payload length in bytes (0 or more).
	 * @throws MWException if the length cannot be read or is below 2.
	 */
	private static function readPayloadLength( $fh ) {
		$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
		if ( $size['int'] < 2 ) {
			throw new MWException( "invalid marker size in jpeg" );
		}
		return $size['int'] - 2;
	}

	/**
	 * Read the payload of the segment whose length field starts at the
	 * current position of the file handle.
	 *
	 * @param $fh Resource: file handle, advanced past the segment.
	 * @return String the payload, without the length field. Empty string
	 *   for a segment that has no payload.
	 * @throws MWException on an invalid length or a truncated segment.
	 */
	private static function jpegExtractMarker( &$fh ) {
		$payloadLength = self::readPayloadLength( $fh );
		if ( $payloadLength === 0 ) {
			// Do not call fread() with a zero length: it is rejected by PHP.
			return '';
		}
		$segment = fread( $fh, $payloadLength );
		if ( strlen( $segment ) !== $payloadLength ) {
			throw new MWException( "Segment shorter than expected" );
		}
		return $segment;
	}

	/**
	 * Compare the IPTC data of an APP13 segment with the hash recorded
	 * alongside it.
	 *
	 * Resource 0x0404 holds the IPTC block and resource 0x0425 holds a hash
	 * of it. This is used to determine whether the IPTC data is newer than
	 * the XMP data, as XMP aware programs update the hash whereas non-XMP
	 * programs don't.
	 *
	 * @param $app13 String: APP13 payload, including the 14 byte
	 *   "Photoshop 3.0\0" identifier (which is skipped, not re-checked).
	 * @return String 'iptc-no-hash' if either the IPTC block or the recorded
	 *   hash is absent, 'iptc-good-hash' if the raw md5 of the IPTC block
	 *   equals the recorded hash, 'iptc-bad-hash' otherwise.
	 * @throws MWException if no segment is given or an item is malformed.
	 */
	public static function doPSIR( $app13 ) {
		if ( !$app13 ) {
			throw new MWException( "No App13 segment given" );
		}

		$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item:
		// 4 (signature) + 2 (id) + 2 (empty name) + 4 (data length).
		while ( $offset + 12 <= $appLen ) {
			// The signature is supposed to be 8BIM but apparently sometimes
			// isn't, especially in really old jpgs. Such items are still
			// stepped over, but their content is ignored.
			$valid = ( substr( $app13, $offset, 4 ) === '8BIM' );
			$offset += 4;

			// 2 byte id number which identifies the piece of info this
			// record contains.
			$id = substr( $app13, $offset, 2 );
			$offset += 2;

			// Some record types can contain a name, which is a pascal string
			// 0-padded to be an even number of bytes. Most times (and any
			// time we care) this is empty, making it two null bytes.
			// The name is never used, so skip it. +1 for the length byte.
			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			if ( $lenName % 2 == 1 ) {
				$lenName++; // pad to even.
			}
			$offset += $lenName;

			// now length of data (unsigned long big endian)
			$lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
			// PHP can take issue with very large unsigned ints and make them
			// negative. Which should never happen, as this has to be inside
			// a segment which is limited to a 16 bit length.
			if ( $lenData['len'] < 0 ) {
				throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );
			}

			$offset += 4; // 4 bytes length field

			// this should not happen, but check.
			if ( $lenData['len'] + $offset > $appLen ) {
				throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
					. "; offset=$offset; total length=$appLen)" );
			}

			if ( $valid ) {
				switch ( $id ) {
					case "\x04\x04":
						// IPTC block
						$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
						break;
					case "\x04\x25":
						// hash of the IPTC block as recorded in the file
						$recordedHash = substr( $app13, $offset, $lenData['len'] );
						break;
				}
			}

			// if odd, add 1 to length to account for null pad byte.
			if ( $lenData['len'] % 2 == 1 ) {
				$lenData['len']++;
			}
			$offset += $lenData['len'];
		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		} elseif ( $realHash === $recordedHash ) {
			return 'iptc-good-hash';
		} else { /* $realHash !== $recordedHash */
			return 'iptc-bad-hash';
		}
	}
}