[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/includes/media/ -> IPTC.php (source)

   1  <?php
   2  /**
   3   * Class for some IPTC functions.
   4   *
   5   * This program is free software; you can redistribute it and/or modify
   6   * it under the terms of the GNU General Public License as published by
   7   * the Free Software Foundation; either version 2 of the License, or
   8   * (at your option) any later version.
   9   *
  10   * This program is distributed in the hope that it will be useful,
  11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13   * GNU General Public License for more details.
  14   *
  15   * You should have received a copy of the GNU General Public License along
  16   * with this program; if not, write to the Free Software Foundation, Inc.,
  17   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18   * http://www.gnu.org/copyleft/gpl.html
  19   *
  20   * @file
  21   * @ingroup Media
  22   */
  23  
  24  /**
  25   * Class for some IPTC functions.
  26   *
  27   * @ingroup Media
  28   */
  29  class IPTC {
  30      /**
  31       * This takes the results of iptcparse() and puts it into a
  32       * form that can be handled by mediawiki. Generally called from
  33       * BitmapMetadataHandler::doApp13.
  34       *
  35       * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
  36       *
  37       * @param string $rawData The app13 block from jpeg containing iptc/iim data
  38       * @return array IPTC metadata array
  39       */
  40  	static function parse( $rawData ) {
  41          $parsed = iptcparse( $rawData );
  42          $data = array();
  43          if ( !is_array( $parsed ) ) {
  44              return $data;
  45          }
  46  
  47          $c = '';
  48          //charset info contained in tag 1:90.
  49          if ( isset( $parsed['1#090'] ) && isset( $parsed['1#090'][0] ) ) {
  50              $c = self::getCharset( $parsed['1#090'][0] );
  51              if ( $c === false ) {
  52                  //Unknown charset. refuse to parse.
  53                  //note: There is a different between
  54                  //unknown and no charset specified.
  55                  return array();
  56              }
  57              unset( $parsed['1#090'] );
  58          }
  59  
  60          foreach ( $parsed as $tag => $val ) {
  61              if ( isset( $val[0] ) && trim( $val[0] ) == '' ) {
  62                  wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
  63                  continue;
  64              }
  65              switch ( $tag ) {
  66                  case '2#120': /*IPTC caption. mapped with exif ImageDescription*/
  67                      $data['ImageDescription'] = self::convIPTC( $val, $c );
  68                      break;
  69                  case '2#116': /* copyright. Mapped with exif copyright */
  70                      $data['Copyright'] = self::convIPTC( $val, $c );
  71                      break;
  72                  case '2#080': /* byline. Mapped with exif Artist */
  73                      /* merge with byline title (2:85)
  74                       * like how exif does it with
  75                       * Title, person. Not sure if this is best
  76                       * approach since we no longer have the two fields
  77                       * separate. each byline title entry corresponds to a
  78                       * specific byline.                          */
  79  
  80                      $bylines = self::convIPTC( $val, $c );
  81                      if ( isset( $parsed['2#085'] ) ) {
  82                          $titles = self::convIPTC( $parsed['2#085'], $c );
  83                      } else {
  84                          $titles = array();
  85                      }
  86  
  87                      $titleCount = count( $titles );
  88                      for ( $i = 0; $i < $titleCount; $i++ ) {
  89                          if ( isset( $bylines[$i] ) ) {
  90                              // theoretically this should always be set
  91                              // but doesn't hurt to be careful.
  92                              $bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
  93                          }
  94                      }
  95                      $data['Artist'] = $bylines;
  96                      break;
  97                  case '2#025': /* keywords */
  98                      $data['Keywords'] = self::convIPTC( $val, $c );
  99                      break;
 100                  case '2#101': /* Country (shown)*/
 101                      $data['CountryDest'] = self::convIPTC( $val, $c );
 102                      break;
 103                  case '2#095': /* state/province (shown) */
 104                      $data['ProvinceOrStateDest'] = self::convIPTC( $val, $c );
 105                      break;
 106                  case '2#090': /* city (Shown) */
 107                      $data['CityDest'] = self::convIPTC( $val, $c );
 108                      break;
 109                  case '2#092': /* sublocation (shown) */
 110                      $data['SublocationDest'] = self::convIPTC( $val, $c );
 111                      break;
 112                  case '2#005': /* object name/title */
 113                      $data['ObjectName'] = self::convIPTC( $val, $c );
 114                      break;
 115                  case '2#040': /* special instructions */
 116                      $data['SpecialInstructions'] = self::convIPTC( $val, $c );
 117                      break;
 118                  case '2#105': /* headline*/
 119                      $data['Headline'] = self::convIPTC( $val, $c );
 120                      break;
 121                  case '2#110': /* credit */
 122                      /*"Identifies the provider of the objectdata,
 123                       * not necessarily the owner/creator". */
 124                      $data['Credit'] = self::convIPTC( $val, $c );
 125                      break;
 126                  case '2#115': /* source */
 127                      /* "Identifies the original owner of the intellectual content of the
 128                       *objectdata. This could be an agency, a member of an agency or
 129                       *an individual." */
 130                      $data['Source'] = self::convIPTC( $val, $c );
 131                      break;
 132  
 133                  case '2#007': /* edit status (lead, correction, etc) */
 134                      $data['EditStatus'] = self::convIPTC( $val, $c );
 135                      break;
 136                  case '2#015': /* category. deprecated. max 3 letters in theory, often more */
 137                      $data['iimCategory'] = self::convIPTC( $val, $c );
 138                      break;
 139                  case '2#020': /* category. deprecated. */
 140                      $data['iimSupplementalCategory'] = self::convIPTC( $val, $c );
 141                      break;
 142                  case '2#010': /*urgency (1-8. 1 most, 5 normal, 8 low priority)*/
 143                      $data['Urgency'] = self::convIPTC( $val, $c );
 144                      break;
 145                  case '2#022':
 146                      /* "Identifies objectdata that recurs often and predictably...
 147                       * Example: Euroweather" */
 148                      $data['FixtureIdentifier'] = self::convIPTC( $val, $c );
 149                      break;
 150                  case '2#026':
 151                      /* Content location code (iso 3166 + some custom things)
 152                       * ex: TUR (for turkey), XUN (for UN), XSP (outer space)
 153                       * See wikipedia article on iso 3166 and appendix D of iim std. */
 154                      $data['LocationDestCode'] = self::convIPTC( $val, $c );
 155                      break;
 156                  case '2#027':
 157                      /* Content location name. Full printable name
 158                       * of location of photo. */
 159                      $data['LocationDest'] = self::convIPTC( $val, $c );
 160                      break;
 161                  case '2#065':
 162                      /* Originating Program.
 163                       * Combine with Program version (2:70) if present.
 164                       */
 165                      $software = self::convIPTC( $val, $c );
 166  
 167                      if ( count( $software ) !== 1 ) {
 168                          //according to iim standard this cannot have multiple values
 169                          //so if there is more than one, something weird is happening,
 170                          //and we skip it.
 171                          wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
 172                          break;
 173                      }
 174  
 175                      if ( isset( $parsed['2#070'] ) ) {
 176                          //if a version is set for the software.
 177                          $softwareVersion = self::convIPTC( $parsed['2#070'], $c );
 178                          unset( $parsed['2#070'] );
 179                          $data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
 180                      } else {
 181                          $data['Software'] = $software;
 182                      }
 183                      break;
 184                  case '2#075':
 185                      /* Object cycle.
 186                       * a for morning (am), p for evening, b for both */
 187                      $data['ObjectCycle'] = self::convIPTC( $val, $c );
 188                      break;
 189                  case '2#100':
 190                      /* Country/Primary location code.
 191                       * "Indicates the code of the country/primary location where the
 192                       * intellectual property of the objectdata was created"
 193                       * unclear how this differs from 2#026
 194                       */
 195                      $data['CountryCodeDest'] = self::convIPTC( $val, $c );
 196                      break;
 197                  case '2#103':
 198                      /* original transmission ref.
 199                       * "A code representing the location of original transmission ac-
 200                       * cording to practises of the provider."
 201                       */
 202                      $data['OriginalTransmissionRef'] = self::convIPTC( $val, $c );
 203                      break;
 204                  case '2#118': /*contact*/
 205                      $data['Contact'] = self::convIPTC( $val, $c );
 206                      break;
 207                  case '2#122':
 208                      /* Writer/Editor
 209                       * "Identification of the name of the person involved in the writing,
 210                       * editing or correcting the objectdata or caption/abstract."
 211                       */
 212                      $data['Writer'] = self::convIPTC( $val, $c );
 213                      break;
 214                  case '2#135': /* lang code */
 215                      $data['LanguageCode'] = self::convIPTC( $val, $c );
 216                      break;
 217  
 218                  // Start date stuff.
 219                  // It doesn't accept incomplete dates even though they are valid
 220                  // according to spec.
 221                  // Should potentially store timezone as well.
 222                  case '2#055':
 223                      //Date created (not date digitized).
 224                      //Maps to exif DateTimeOriginal
 225                      if ( isset( $parsed['2#060'] ) ) {
 226                          $time = $parsed['2#060'];
 227                      } else {
 228                          $time = array();
 229                      }
 230                      $timestamp = self::timeHelper( $val, $time, $c );
 231                      if ( $timestamp ) {
 232                          $data['DateTimeOriginal'] = $timestamp;
 233                      }
 234                      break;
 235  
 236                  case '2#062':
 237                      //Date converted to digital representation.
 238                      //Maps to exif DateTimeDigitized
 239                      if ( isset( $parsed['2#063'] ) ) {
 240                          $time = $parsed['2#063'];
 241                      } else {
 242                          $time = array();
 243                      }
 244                      $timestamp = self::timeHelper( $val, $time, $c );
 245                      if ( $timestamp ) {
 246                          $data['DateTimeDigitized'] = $timestamp;
 247                      }
 248                      break;
 249  
 250                  case '2#030':
 251                      //Date released.
 252                      if ( isset( $parsed['2#035'] ) ) {
 253                          $time = $parsed['2#035'];
 254                      } else {
 255                          $time = array();
 256                      }
 257                      $timestamp = self::timeHelper( $val, $time, $c );
 258                      if ( $timestamp ) {
 259                          $data['DateTimeReleased'] = $timestamp;
 260                      }
 261                      break;
 262  
 263                  case '2#037':
 264                      //Date expires.
 265                      if ( isset( $parsed['2#038'] ) ) {
 266                          $time = $parsed['2#038'];
 267                      } else {
 268                          $time = array();
 269                      }
 270                      $timestamp = self::timeHelper( $val, $time, $c );
 271                      if ( $timestamp ) {
 272                          $data['DateTimeExpires'] = $timestamp;
 273                      }
 274                      break;
 275  
 276                  case '2#000': /* iim version */
 277                      // unlike other tags, this is a 2-byte binary number.
 278                      //technically this is required if there is iptc data
 279                      //but in practise it isn't always there.
 280                      if ( strlen( $val[0] ) == 2 ) {
 281                          //if is just to be paranoid.
 282                          $versionValue = ord( substr( $val[0], 0, 1 ) ) * 256;
 283                          $versionValue += ord( substr( $val[0], 1, 1 ) );
 284                          $data['iimVersion'] = $versionValue;
 285                      }
 286                      break;
 287  
 288                  case '2#004':
 289                      // IntellectualGenere.
 290                      // first 4 characters are an id code
 291                      // That we're not really interested in.
 292  
 293                      // This prop is weird, since it's
 294                      // allowed to have multiple values
 295                      // in iim 4.1, but not in the XMP
 296                      // stuff. We're going to just
 297                      // extract the first value.
 298                      $con = self::ConvIPTC( $val, $c );
 299                      if ( strlen( $con[0] ) < 5 ) {
 300                          wfDebugLog( 'iptc', 'IPTC: '
 301                              . '2:04 too short. '
 302                              . 'Ignoring.' );
 303                          break;
 304                      }
 305                      $extracted = substr( $con[0], 4 );
 306                      $data['IntellectualGenre'] = $extracted;
 307                      break;
 308  
 309                  case '2#012':
 310                      // Subject News code - this is a compound field
 311                      // at the moment we only extract the subject news
 312                      // code, which is an 8 digit (ascii) number
 313                      // describing the subject matter of the content.
 314                      $codes = self::convIPTC( $val, $c );
 315                      foreach ( $codes as $ic ) {
 316                          $fields = explode( ':', $ic, 3 );
 317  
 318                          if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
 319                              wfDebugLog( 'IPTC', 'IPTC: '
 320                                  . 'Invalid 2:12 - ' . $ic );
 321                              break;
 322                          }
 323                          $data['SubjectNewsCode'] = $fields[1];
 324                      }
 325                      break;
 326  
 327                  // purposely does not do 2:125, 2:130, 2:131,
 328                  // 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
 329                  // 2:200, 2:201, 2:202
 330                  // or the audio stuff (2:150 to 2:154)
 331  
 332                  case '2#070':
 333                  case '2#060':
 334                  case '2#063':
 335                  case '2#085':
 336                  case '2#038':
 337                  case '2#035':
 338                      //ignore. Handled elsewhere.
 339                      break;
 340  
 341                  default:
 342                      wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
 343                      break;
 344              }
 345          }
 346  
 347          return $data;
 348      }
 349  
 350      /**
 351       * Convert an iptc date and time tags into the exif format
 352       *
 353       * @todo Potentially this should also capture the timezone offset.
 354       * @param array $date The date tag
 355       * @param array $time The time tag
 356       * @param string $c The charset
 357       * @return string Date in EXIF format.
 358       */
 359  	private static function timeHelper( $date, $time, $c ) {
 360          if ( count( $date ) === 1 ) {
 361              //the standard says this should always be 1
 362              //just double checking.
 363              list( $date ) = self::convIPTC( $date, $c );
 364          } else {
 365              return null;
 366          }
 367  
 368          if ( count( $time ) === 1 ) {
 369              list( $time ) = self::convIPTC( $time, $c );
 370              $dateOnly = false;
 371          } else {
 372              $time = '000000+0000'; //placeholder
 373              $dateOnly = true;
 374          }
 375  
 376          if ( !( preg_match( '/\d\d\d\d\d\d[-+]\d\d\d\d/', $time )
 377              && preg_match( '/\d\d\d\d\d\d\d\d/', $date )
 378              && substr( $date, 0, 4 ) !== '0000'
 379              && substr( $date, 4, 2 ) !== '00'
 380              && substr( $date, 6, 2 ) !== '00'
 381          ) ) {
 382              //something wrong.
 383              // Note, this rejects some valid dates according to iptc spec
 384              // for example: the date 00000400 means the photo was taken in
 385              // April, but the year and day is unknown. We don't process these
 386              // types of incomplete dates atm.
 387              wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );
 388  
 389              return null;
 390          }
 391  
 392          $unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
 393          if ( $unixTS === false ) {
 394              wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );
 395  
 396              return null;
 397          }
 398  
 399          $tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
 400              + ( intval( substr( $time, 9, 2 ) ) * 60 );
 401  
 402          if ( substr( $time, 6, 1 ) === '-' ) {
 403              $tz = -$tz;
 404          }
 405  
 406          $finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
 407          if ( $finalTimestamp === false ) {
 408              wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );
 409  
 410              return null;
 411          }
 412          if ( $dateOnly ) {
 413              //return the date only
 414              return substr( $finalTimestamp, 0, 10 );
 415          } else {
 416              return $finalTimestamp;
 417          }
 418      }
 419  
 420      /**
 421       * Helper function to convert charset for iptc values.
 422       * @param string|array $data The iptc string
 423       * @param string $charset The charset
 424       *
 425       * @return string|array
 426       */
 427  	private static function convIPTC( $data, $charset ) {
 428          if ( is_array( $data ) ) {
 429              foreach ( $data as &$val ) {
 430                  $val = self::convIPTCHelper( $val, $charset );
 431              }
 432          } else {
 433              $data = self::convIPTCHelper( $data, $charset );
 434          }
 435  
 436          return $data;
 437      }
 438  
 439      /**
 440       * Helper function of a helper function to convert charset for iptc values.
 441       * @param string|array $data The IPTC string
 442       * @param string $charset The charset
 443       *
 444       * @return string
 445       */
 446  	private static function convIPTCHelper( $data, $charset ) {
 447          if ( $charset ) {
 448              wfSuppressWarnings();
 449              $data = iconv( $charset, "UTF-8//IGNORE", $data );
 450              wfRestoreWarnings();
 451              if ( $data === false ) {
 452                  $data = "";
 453                  wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
 454              }
 455          } else {
 456              //treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
 457              // most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
 458              $oldData = $data;
 459              UtfNormal::quickIsNFCVerify( $data ); //make $data valid utf-8
 460              if ( $data === $oldData ) {
 461                  return $data; //if validation didn't change $data
 462              } else {
 463                  return self::convIPTCHelper( $oldData, 'Windows-1252' );
 464              }
 465          }
 466  
 467          return trim( $data );
 468      }
 469  
 470      /**
 471       * take the value of 1:90 tag and returns a charset
 472       * @param string $tag 1:90 tag.
 473       * @return string Charset name or "?"
 474       * Warning, this function does not (and is not intended to) detect
 475       * all iso 2022 escape codes. In practise, the code for utf-8 is the
 476       * only code that seems to have wide use. It does detect that code.
 477       */
 478  	static function getCharset( $tag ) {
 479  
 480          //According to iim standard, charset is defined by the tag 1:90.
 481          //in which there are iso 2022 escape sequences to specify the character set.
 482          //the iim standard seems to encourage that all necessary escape sequences are
 483          //in the 1:90 tag, but says it doesn't have to be.
 484  
 485          //This is in need of more testing probably. This is definitely not complete.
 486          //however reading the docs of some other iptc software, it appears that most iptc software
 487          //only recognizes utf-8. If 1:90 tag is not present content is
 488          // usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.
 489  
 490          //This also won't work if there are more than one escape sequence in the 1:90 tag
 491          //or if something is put in the G2, or G3 charsets, etc. It will only reliably recognize utf-8.
 492  
 493          // This is just going through the charsets mentioned in appendix C of the iim standard.
 494  
 495          //  \x1b = ESC.
 496          switch ( $tag ) {
 497              case "\x1b%G": //utf-8
 498              //Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
 499              case "\x1b(B": // ascii
 500              case "\x1b(@": // iso-646-IRV (ascii in latest version, $ different in older version)
 501                  $c = 'UTF-8';
 502                  break;
 503              case "\x1b(A": //like ascii, but british.
 504                  $c = 'ISO646-GB';
 505                  break;
 506              case "\x1b(C": //some obscure sweedish/finland encoding
 507                  $c = 'ISO-IR-8-1';
 508                  break;
 509              case "\x1b(D":
 510                  $c = 'ISO-IR-8-2';
 511                  break;
 512              case "\x1b(E": //some obscure danish/norway encoding
 513                  $c = 'ISO-IR-9-1';
 514                  break;
 515              case "\x1b(F":
 516                  $c = 'ISO-IR-9-2';
 517                  break;
 518              case "\x1b(G":
 519                  $c = 'SEN_850200_B'; // aka iso 646-SE; ascii-like
 520                  break;
 521              case "\x1b(I":
 522                  $c = "ISO646-IT";
 523                  break;
 524              case "\x1b(L":
 525                  $c = "ISO646-PT";
 526                  break;
 527              case "\x1b(Z":
 528                  $c = "ISO646-ES";
 529                  break;
 530              case "\x1b([":
 531                  $c = "GREEK7-OLD";
 532                  break;
 533              case "\x1b(K":
 534                  $c = "ISO646-DE";
 535                  break;
 536              case "\x1b(N": //crylic
 537                  $c = "ISO_5427";
 538                  break;
 539              case "\x1b(`": //iso646-NO
 540                  $c = "NS_4551-1";
 541                  break;
 542              case "\x1b(f": //iso646-FR
 543                  $c = "NF_Z_62-010";
 544                  break;
 545              case "\x1b(g":
 546                  $c = "PT2"; //iso646-PT2
 547                  break;
 548              case "\x1b(h":
 549                  $c = "ES2";
 550                  break;
 551              case "\x1b(i": //iso646-HU
 552                  $c = "MSZ_7795.3";
 553                  break;
 554              case "\x1b(w":
 555                  $c = "CSA_Z243.4-1985-1";
 556                  break;
 557              case "\x1b(x":
 558                  $c = "CSA_Z243.4-1985-2";
 559                  break;
 560              case "\x1b\$(B":
 561              case "\x1b\$B":
 562              case "\x1b&@\x1b\$B":
 563              case "\x1b&@\x1b\$(B":
 564                  $c = "JIS_C6226-1983";
 565                  break;
 566              case "\x1b-A": // iso-8859-1. at least for the high code characters.
 567              case "\x1b(@\x1b-A":
 568              case "\x1b(B\x1b-A":
 569                  $c = 'ISO-8859-1';
 570                  break;
 571              case "\x1b-B": // iso-8859-2. at least for the high code characters.
 572                  $c = 'ISO-8859-2';
 573                  break;
 574              case "\x1b-C": // iso-8859-3. at least for the high code characters.
 575                  $c = 'ISO-8859-3';
 576                  break;
 577              case "\x1b-D": // iso-8859-4. at least for the high code characters.
 578                  $c = 'ISO-8859-4';
 579                  break;
 580              case "\x1b-E": // iso-8859-5. at least for the high code characters.
 581                  $c = 'ISO-8859-5';
 582                  break;
 583              case "\x1b-F": // iso-8859-6. at least for the high code characters.
 584                  $c = 'ISO-8859-6';
 585                  break;
 586              case "\x1b-G": // iso-8859-7. at least for the high code characters.
 587                  $c = 'ISO-8859-7';
 588                  break;
 589              case "\x1b-H": // iso-8859-8. at least for the high code characters.
 590                  $c = 'ISO-8859-8';
 591                  break;
 592              case "\x1b-I": // CSN_369103. at least for the high code characters.
 593                  $c = 'CSN_369103';
 594                  break;
 595              default:
 596                  wfDebugLog( 'iptc', __METHOD__ . 'Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
 597                  //at this point just give up and refuse to parse iptc?
 598                  $c = false;
 599          }
 600          return $c;
 601      }
 602  }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1