MediaWiki
REL1_22
|
<?php
/**
 * Pulls the metadata-bearing segments (comments, XMP, Exif byte order and
 * Photoshop image resources) out of a JPEG file, and checks the IPTC hash
 * stored in a Photoshop APP13 segment.
 *
 * This class only locates and returns the raw segments; it does not
 * interpret the XMP, Exif or IPTC payloads themselves.
 */
class JpegMetadataExtractor {

	/**
	 * Sanity limit on the number of segments read from one file.
	 * A jpeg file should never even remotely have that many segments;
	 * an average file has about 10.
	 */
	const MAX_JPEG_SEGMENTS = 200;

	/**
	 * Split a JPEG file into the segments relevant for metadata.
	 *
	 * Reading stops at the first SOS (start of scan) or EOI (end of image)
	 * marker, as nothing of interest follows them.
	 *
	 * @param string $filename Path of the JPEG file to read
	 * @return array Always has the keys 'XMP_ext', 'COM' and 'PSIR' (each a
	 *   list of strings, possibly empty). May additionally have 'XMP' (string,
	 *   the XMP packet without its identifier) and 'byteOrder' ('BE' or 'LE',
	 *   taken from the Exif header). XMP keys are only filled in when
	 *   xml_parser_create_ns() is available.
	 * @throws MWException If no usable file is given, or if the file is not
	 *   a well formed JPEG (no SOI, bad marker, bad or truncated segment,
	 *   too many segments, or no SOS/EOI before the end of the file).
	 */
	public static function segmentSplitter( $filename ) {
		if ( !$filename ) {
			throw new MWException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );

		if ( !$fh ) {
			throw new MWException( "Could not open file $filename" );
		}

		// Close the handle on every path out of this method. A catch-and-rethrow
		// is used instead of "finally" so this keeps working on PHP 5.3/5.4.
		try {
			$segments = self::readSegments( $fh, __METHOD__ );
		} catch ( Exception $e ) {
			fclose( $fh );
			throw $e;
		}
		fclose( $fh );

		return $segments;
	}

	/**
	 * Walk the segments of an already opened JPEG file and collect the
	 * interesting ones. Does not close the handle.
	 *
	 * @param resource $fh File handle positioned at the start of the file
	 * @param string $caller Method name used as prefix for debug log lines
	 * @return array See segmentSplitter()
	 * @throws MWException On any structural problem in the file
	 */
	private static function readSegments( $fh, $caller ) {
		$showXMP = function_exists( 'xml_parser_create_ns' );

		$segments = array(
			'XMP_ext' => array(),
			'COM' => array(),
			'PSIR' => array(),
		);
		$segmentCount = 0;

		// Every JPEG starts with the SOI marker.
		if ( fread( $fh, 2 ) !== "\xFF\xD8" ) {
			throw new MWException( "Not a jpeg, no SOI" );
		}

		while ( !feof( $fh ) ) {
			$buffer = fread( $fh, 1 );
			$segmentCount++;
			if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
				// this is just a sanity check
				throw new MWException( 'Too many jpeg segments. Aborting' );
			}
			if ( $buffer !== "\xFF" ) {
				throw new MWException( "Error reading jpeg file marker. Expected 0xFF but got " . bin2hex( $buffer ) );
			}

			$marker = fread( $fh, 1 );
			while ( $marker === "\xFF" && !feof( $fh ) ) {
				// Skip through any 0xFF padding bytes.
				$marker = fread( $fh, 1 );
			}

			if ( $marker === "\xFE" ) {
				// COM section -- file comment
				$comment = self::decodeComment( self::jpegExtractMarker( $fh ), $caller );
				if ( $comment !== null ) {
					$segments["COM"][] = $comment;
				}
			} elseif ( $marker === "\xE1" ) {
				// APP1 section (Exif, XMP, and XMP extended)
				// XMP is only extracted if XMP support is enabled.
				$payload = self::jpegExtractMarker( $fh );
				// check what type of app segment this is.
				if ( $showXMP && substr( $payload, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" ) {
					$segments["XMP"] = substr( $payload, 29 );
				} elseif ( $showXMP && substr( $payload, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" ) {
					$segments["XMP_ext"][] = substr( $payload, 35 );
				} elseif ( $showXMP && substr( $payload, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" ) {
					// Some images (especially flickr images) seem to have this
					// malformed identifier. Accept it anyway.
					$segments["XMP"] = substr( $payload, 29 );
					wfDebug( $caller . ' Found XMP section with wrong app identifier '
						. "Using anyways.\n" );
				} elseif ( substr( $payload, 0, 6 ) === "Exif\0\0" ) {
					// Only the byte order is needed from the Exif header.
					// This is II for little endian, MM for big endian.
					// It is not a unicode BOM.
					$byteOrderMarker = substr( $payload, 6, 2 );
					if ( $byteOrderMarker === 'MM' ) {
						$segments['byteOrder'] = 'BE';
					} elseif ( $byteOrderMarker === 'II' ) {
						$segments['byteOrder'] = 'LE';
					} else {
						wfDebug( $caller . " Invalid byte ordering?!\n" );
					}
				}
			} elseif ( $marker === "\xED" ) {
				// APP13 - PSIR. IPTC and some photoshop stuff
				$payload = self::jpegExtractMarker( $fh );
				if ( substr( $payload, 0, 14 ) === "Photoshop 3.0\x00" ) {
					$segments["PSIR"][] = $payload;
				}
			} elseif ( $marker === "\xD9" || $marker === "\xDA" ) {
				// EOI - end of image or SOS - start of scan.
				// Either way we're past any interesting segments.
				return $segments;
			} else {
				// Segment we don't care about, so skip over its payload.
				fseek( $fh, self::readPayloadLength( $fh ), SEEK_CUR );
			}
		}

		// shouldn't get here.
		throw new MWException( "Reached end of jpeg file unexpectedly" );
	}

	/**
	 * Turn the raw bytes of a COM segment into a UTF-8 string.
	 *
	 * The comment is first checked as UTF-8; if that fails it is assumed to
	 * be windows-1252 and converted. If the result is still not valid it is
	 * probably binary junk or some unusual encoding and is dropped.
	 *
	 * @param string $raw Payload of the COM segment
	 * @param string $caller Method name used as prefix for debug log lines
	 * @return string|null The trimmed comment, or null if it is unusable
	 */
	private static function decodeComment( $raw, $caller ) {
		$com = $oldCom = trim( $raw );
		// quickIsNFCVerify() rewrites its argument into valid UTF-8, so an
		// unchanged string means the input already was UTF-8.
		UtfNormal::quickIsNFCVerify( $com );
		if ( $com !== $oldCom ) {
			wfSuppressWarnings();
			$converted = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
			wfRestoreWarnings();
			if ( $converted === false ) {
				// iconv() could not convert at all; do not let a boolean
				// end up in the list of comments.
				wfDebug( $caller . " Ignoring JPEG comment as is garbage.\n" );
				return null;
			}
			$com = $oldCom = $converted;
		}
		// Try it again on the (possibly converted) string.
		UtfNormal::quickIsNFCVerify( $com );
		if ( $com === $oldCom ) {
			return $oldCom;
		}

		wfDebug( $caller . " Ignoring JPEG comment as is garbage.\n" );
		return null;
	}

	/**
	 * Read the two byte big endian length field of a segment and return the
	 * number of payload bytes that follow it.
	 *
	 * The stored length counts the length field itself, so 2 is the smallest
	 * legal value and means the segment has an empty payload.
	 *
	 * @param resource $fh File handle positioned at a segment length field
	 * @return int Number of payload bytes (0 or more)
	 * @throws MWException If the stored length is smaller than 2
	 */
	private static function readPayloadLength( $fh ) {
		$size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
		if ( $size['int'] < 2 ) {
			throw new MWException( "invalid marker size in jpeg" );
		}
		return $size['int'] - 2;
	}

	/**
	 * Read the payload of the segment at the current file position.
	 *
	 * @param resource &$fh File handle positioned just after a segment marker
	 * @return string The payload, without the length field; '' if empty
	 * @throws MWException If the length field is invalid or the file ends
	 *   before the whole payload could be read
	 */
	private static function jpegExtractMarker( &$fh ) {
		$length = self::readPayloadLength( $fh );
		if ( $length === 0 ) {
			// fread() does not accept a zero length, and there is nothing to read.
			return '';
		}
		$segment = fread( $fh, $length );
		if ( strlen( $segment ) !== $length ) {
			throw new MWException( "Segment shorter than expected" );
		}
		return $segment;
	}

	/**
	 * Compare the IPTC hash recorded in a Photoshop APP13 segment with the
	 * md5 of the IPTC data actually present in that segment.
	 *
	 * Resource 0x0404 contains the IPTC data, 0x0425 the recorded hash. This
	 * is used to determine if the IPTC data is newer than the XMP data, as
	 * XMP aware programs update the hash where other programs don't.
	 *
	 * @param string $app13 Full APP13 payload, including the 14 byte
	 *   "Photoshop 3.0\0" identifier (which is skipped, not verified, here)
	 * @return string 'iptc-no-hash' if either the IPTC data or the recorded
	 *   hash is missing, 'iptc-good-hash' if they match, 'iptc-bad-hash'
	 *   otherwise
	 * @throws MWException If $app13 is empty or an item claims to be longer
	 *   than the segment
	 */
	public static function doPSIR( $app13 ) {
		if ( !$app13 ) {
			throw new MWException( "No App13 segment given" );
		}

		$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item.
		while ( $offset + 12 <= $appLen ) {
			// Items are supposed to start with 8BIM, but apparently sometimes
			// don't, especially in really old jpg's. Such items are stepped
			// over but their contents are not used.
			$valid = substr( $app13, $offset, 4 ) === '8BIM';
			$offset += 4;

			// 2 byte id number which identifies the piece of info
			// this record contains.
			$id = substr( $app13, $offset, 2 );
			$offset += 2;

			// Some record types can contain a name, which is a pascal string
			// 0-padded to be an even number of bytes. Most times (and any time
			// we care) this is empty, making it two null bytes.
			// We never use the name so skip it. +1 for the length byte.
			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			if ( $lenName % 2 === 1 ) {
				$lenName++; // pad to even.
			}
			$offset += $lenName;

			// now length of data (unsigned long big endian)
			$lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
			// PHP can take issue with very large unsigned ints and make them negative.
			// Which should never ever happen, as this has to be inside a segment
			// which is limited to a 16 bit number.
			if ( $lenData['len'] < 0 ) {
				throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );
			}

			$offset += 4; // 4bytes length field;

			// this should not happen, but check.
			if ( $lenData['len'] + $offset > $appLen ) {
				throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
					. "; offset=$offset; total length=$appLen)" );
			}

			if ( $valid ) {
				switch ( $id ) {
					case "\x04\x04":
						// IPTC block
						$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
						break;
					case "\x04\x25":
						// Hash of the IPTC block as recorded by the writer
						$recordedHash = substr( $app13, $offset, $lenData['len'] );
						break;
				}
			}

			// if odd, add 1 to length to account for
			// null pad byte.
			if ( $lenData['len'] % 2 === 1 ) {
				$lenData['len']++;
			}
			$offset += $lenData['len'];
		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		} elseif ( $realHash === $recordedHash ) {
			return 'iptc-good-hash';
		} else { /*$realHash !== $recordedHash */
			return 'iptc-bad-hash';
		}
	}
}