[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/includes/media/ -> JpegMetadataExtractor.php (source)

   1  <?php
   2  /**
   3   * Extraction of JPEG image metadata.
   4   *
   5   * This program is free software; you can redistribute it and/or modify
   6   * it under the terms of the GNU General Public License as published by
   7   * the Free Software Foundation; either version 2 of the License, or
   8   * (at your option) any later version.
   9   *
  10   * This program is distributed in the hope that it will be useful,
  11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13   * GNU General Public License for more details.
  14   *
  15   * You should have received a copy of the GNU General Public License along
  16   * with this program; if not, write to the Free Software Foundation, Inc.,
  17   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18   * http://www.gnu.org/copyleft/gpl.html
  19   *
  20   * @file
  21   * @ingroup Media
  22   */
  23  
  /**
   * Class for reading JPEG files and extracting their metadata segments.
   * See also BitmapMetadataHandler.
   *
   * Based somewhat on GIFMetadataExtractor.
   *
   * @ingroup Media
   */
  class JpegMetadataExtractor {
      /**
       * Sanity limit on the number of segments examined in a single file.
       * A JPEG file should never even remotely have that many segments;
       * an average file has about 10.
       */
      const MAX_JPEG_SEGMENTS = 200;

      /**
       * Extract the metadata segments of interest from a JPEG file.
       *
       * getimagesize() can almost do this, but it does not support multiple
       * APP1 segments, so it cannot extract XMP from files that contain both
       * Exif and XMP data.
       *
       * The returned array always has the keys 'XMP_ext', 'COM' and 'PSIR'
       * (each a list of strings). 'XMP' (string) and 'byteOrder' ('BE' or
       * 'LE') are only present when the matching segment was found.
       *
       * The file handle is closed before returning, whether the scan
       * succeeds or throws.
       *
       * @param string $filename Name of jpeg file
       * @return array Array of interesting segments.
       * @throws MWException If given invalid file.
       */
      public static function segmentSplitter( $filename ) {
          if ( !$filename ) {
              throw new MWException( "No filename specified for " . __METHOD__ );
          }
          if ( !file_exists( $filename ) || is_dir( $filename ) ) {
              throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
          }

          $fh = fopen( $filename, "rb" );
          if ( !$fh ) {
              throw new MWException( "Could not open file $filename" );
          }

          // XMP is only extracted when the XML parser extension is available.
          $showXMP = function_exists( 'xml_parser_create_ns' );

          // Catch-and-rethrow rather than "finally" so this keeps working on
          // PHP versions that predate that keyword.
          try {
              $segments = self::scanSegments( $fh, $showXMP, __METHOD__ );
          } catch ( Exception $e ) {
              fclose( $fh );
              throw $e;
          }
          fclose( $fh );

          return $segments;
      }

      /**
       * Walk the segments of an already opened JPEG file and collect the
       * interesting ones. Stops at the first EOI or SOS marker.
       *
       * @param resource $fh File handle positioned at the start of the file
       * @param bool $showXMP Whether XMP / extended XMP should be collected
       * @param string $caller Method name used as prefix for debug log lines
       * @return array Array of interesting segments.
       * @throws MWException On malformed or truncated input.
       */
      private static function scanSegments( $fh, $showXMP, $caller ) {
          $segments = array(
              'XMP_ext' => array(),
              'COM' => array(),
              'PSIR' => array(),
          );

          if ( fread( $fh, 2 ) !== "\xFF\xD8" ) {
              throw new MWException( "Not a jpeg, no SOI" );
          }

          $segmentCount = 0;
          while ( !feof( $fh ) ) {
              $buffer = fread( $fh, 1 );
              $segmentCount++;
              if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
                  // this is just a sanity check
                  throw new MWException( 'Too many jpeg segments. Aborting' );
              }
              if ( $buffer !== "\xFF" ) {
                  throw new MWException( "Error reading jpeg file marker. " .
                      "Expected 0xFF but got " . bin2hex( $buffer ) );
              }

              $marker = fread( $fh, 1 );
              while ( $marker === "\xFF" && !feof( $fh ) ) {
                  // Skip through any 0xFF padding bytes.
                  $marker = fread( $fh, 1 );
              }

              if ( $marker === "\xD9" || $marker === "\xDA" ) {
                  // EOI - end of image or SOS - start of scan.
                  // Either way we're past any interesting segments.
                  return $segments;
              }

              if ( $marker === "\xFE" ) {
                  // COM section -- file comment
                  $comment = self::decodeComment( self::jpegExtractMarker( $fh ) );
                  if ( $comment === null ) {
                      wfDebug( $caller . " Ignoring JPEG comment as is garbage.\n" );
                  } else {
                      $segments["COM"][] = $comment;
                  }
              } elseif ( $marker === "\xE1" ) {
                  // APP1 section (Exif, XMP, and XMP extended).
                  // XMP variants are only extracted if XMP is enabled.
                  $temp = self::jpegExtractMarker( $fh );
                  // check what type of app segment this is.
                  if ( $showXMP && substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" ) {
                      $segments["XMP"] = substr( $temp, 29 );
                  } elseif ( $showXMP
                      && substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00"
                  ) {
                      $segments["XMP_ext"][] = substr( $temp, 35 );
                  } elseif ( $showXMP
                      && substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00"
                  ) {
                      // Some images (especially flickr images) seem to have
                      // this malformed identifier. Accept it anyway.
                      $segments["XMP"] = substr( $temp, 29 );
                      wfDebug( $caller . ' Found XMP section with wrong app identifier '
                          . "Using anyways.\n" );
                  } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
                      // Only the byte order is needed from the Exif segment.
                      // This is II for little endian, MM for big endian.
                      // It is not a unicode BOM.
                      $byteOrderMarker = substr( $temp, 6, 2 );
                      if ( $byteOrderMarker === 'MM' ) {
                          $segments['byteOrder'] = 'BE';
                      } elseif ( $byteOrderMarker === 'II' ) {
                          $segments['byteOrder'] = 'LE';
                      } else {
                          wfDebug( $caller . " Invalid byte ordering?!\n" );
                      }
                  }
              } elseif ( $marker === "\xED" ) {
                  // APP13 - PSIR. IPTC and some photoshop stuff
                  $temp = self::jpegExtractMarker( $fh );
                  if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
                      $segments["PSIR"][] = $temp;
                  }
              } else {
                  // Segment we don't care about, so skip its payload.
                  // Seeking past the end of a truncated file is not reported
                  // here; the next marker read fails instead.
                  fseek( $fh, self::readPayloadLength( $fh ), SEEK_CUR );
              }
          }

          // shouldn't get here.
          throw new MWException( "Reached end of jpeg file unexpectedly" );
      }

      /**
       * Turn the raw payload of a COM segment into a UTF-8 string.
       *
       * The trimmed text is used as is when UtfNormal::quickIsNFCVerify()
       * leaves it unchanged (i.e. it already is valid UTF-8); otherwise it
       * is converted from windows-1252 and verified once more.
       *
       * @param string $raw Payload of the COM segment
       * @return string|null The comment, or null if it looks like binary
       *   junk or uses an encoding that could not be converted.
       */
      private static function decodeComment( $raw ) {
          $com = $oldCom = trim( $raw );
          UtfNormal::quickIsNFCVerify( $com );
          if ( $com !== $oldCom ) {
              wfSuppressWarnings();
              $converted = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
              wfRestoreWarnings();
              if ( $converted === false ) {
                  // iconv() reports failure with false; never hand that on
                  // as if it were a comment.
                  return null;
              }
              $com = $oldCom = $converted;
          }
          // Try it again, if it is still not a valid string, then probably
          // binary junk or some really weird encoding, so don't extract.
          UtfNormal::quickIsNFCVerify( $com );

          return $com === $oldCom ? $oldCom : null;
      }

      /**
       * Read the two byte big-endian length field of a segment.
       *
       * The length stored in the file counts the length field itself, so 2
       * is the smallest legal value and denotes a segment without payload.
       *
       * @param resource $fh File handle positioned at a length field
       * @return int Number of payload bytes following the length field
       * @throws MWException If the stored length is smaller than 2, or
       *   (presumably, via wfUnpack) if the field could not be read.
       */
      private static function readPayloadLength( $fh ) {
          $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
          if ( $size['int'] < 2 ) {
              throw new MWException( "invalid marker size in jpeg" );
          }

          return $size['int'] - 2;
      }

      /**
       * Helper function for segmentSplitter: read the payload of the
       * segment whose length field is next in the file.
       *
       * @param resource &$fh File handle for JPEG file
       * @throws MWException
       * @return string Data content of segment (may be empty).
       */
      private static function jpegExtractMarker( &$fh ) {
          $length = self::readPayloadLength( $fh );
          if ( $length === 0 ) {
              // fread() does not accept a length of 0.
              return '';
          }

          $segment = fread( $fh, $length );
          if ( strlen( $segment ) !== $length ) {
              throw new MWException( "Segment shorter than expected" );
          }

          return $segment;
      }

      /**
       * This reads the photoshop image resource.
       * Currently it only compares the iptc/iim hash
       * with the stored hash, which is used to determine the precedence
       * of the iptc data. In future it may extract some other info, like
       * url of copyright license.
       *
       * This should generally be called by BitmapMetadataHandler::doApp13()
       *
       * @param string $app13 Photoshop psir app13 block from jpg.
       * @throws MWException (It gets caught next level up though)
       * @return string If the iptc hash is good or not. One of 'iptc-no-hash',
       *   'iptc-good-hash', 'iptc-bad-hash'.
       */
      public static function doPSIR( $app13 ) {
          if ( !$app13 ) {
              throw new MWException( "No App13 segment given" );
          }
          // Compare the recorded hash with the real thing:
          // resource 0x404 contains the IPTC data, 0x425 has its hash.
          // This is used to determine if the iptc is newer than
          // the xmp data, as xmp programs update the hash,
          // where non-xmp programs don't.

          // skip past PHOTOSHOP 3.0 identifier. should already be checked.
          $offset = 14;
          $appLen = strlen( $app13 );
          $realHash = "";
          $recordedHash = "";

          // the +12 is the length of an empty item.
          while ( $offset + 12 <= $appLen ) {
              // The signature is supposed to be 8BIM, but apparently
              // sometimes isn't, esp. in really old jpg's. Such items are
              // still stepped over, but their content is not used.
              $valid = ( substr( $app13, $offset, 4 ) === '8BIM' );
              $offset += 4;

              // 2 byte id number which identifies the piece of info
              // this record contains.
              $id = substr( $app13, $offset, 2 );
              $offset += 2;

              // Some record types can contain a name, which is a pascal
              // string 0-padded to be an even number of bytes. Most times
              // (and any time we care) this is empty, making it two null
              // bytes. The name is never used, so skip it.
              // +1 for the length byte itself.
              $lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
              if ( $lenName % 2 == 1 ) {
                  // pad to even.
                  $lenName++;
              }
              $offset += $lenName;

              // now length of data (unsigned long big endian)
              $lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
              $dataLen = $lenData['len'];
              // PHP can take issue with very large unsigned ints and make
              // them negative. Which should never ever happen, as this has
              // to be inside a segment which is limited to a 16 bit number.
              if ( $dataLen < 0 ) {
                  throw new MWException( "Too big PSIR (" . $dataLen . ')' );
              }

              $offset += 4; // 4 bytes length field

              // this should not happen, but check.
              if ( $dataLen + $offset > $appLen ) {
                  throw new MWException( "PSIR data too long. (item length=" . $dataLen
                      . "; offset=$offset; total length=$appLen)" );
              }

              if ( $valid ) {
                  if ( $id === "\x04\x04" ) {
                      // IPTC block
                      $realHash = md5( substr( $app13, $offset, $dataLen ), true );
                  } elseif ( $id === "\x04\x25" ) {
                      // hash of the IPTC block as recorded in the file
                      $recordedHash = substr( $app13, $offset, $dataLen );
                  }
              }

              // if odd, add 1 to length to account for null pad byte.
              if ( $dataLen % 2 == 1 ) {
                  $dataLen++;
              }
              $offset += $dataLen;
          }

          if ( !$realHash || !$recordedHash ) {
              return 'iptc-no-hash';
          }
          if ( $realHash === $recordedHash ) {
              return 'iptc-good-hash';
          }

          return 'iptc-bad-hash';
      }
  }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1