MediaWiki  REL1_19
IPTC.php
Go to the documentation of this file.
00001 <?php
00006 class IPTC {
00007 
00018         static function parse( $rawData ) {
00019                 $parsed = iptcparse( $rawData );
00020                 $data = Array();
00021                 if (!is_array($parsed)) {
00022                                 return $data;
00023                 }
00024 
00025                 $c = '';
00026                 //charset info contained in tag 1:90.
00027                 if (isset($parsed['1#090']) && isset($parsed['1#090'][0])) {
00028                         $c = self::getCharset($parsed['1#090'][0]);
00029                         if ($c === false) {
00030                                 //Unknown charset. refuse to parse.
00031                                 //note: There is a different between
00032                                 //unknown and no charset specified.
00033                                 return array();
00034                         }
00035                         unset( $parsed['1#090'] );
00036                 }
00037 
00038                 foreach ( $parsed as $tag => $val ) {
00039                         if ( isset( $val[0] ) && trim($val[0]) == '' ) {
00040                                 wfDebugLog('iptc', "IPTC tag $tag had only whitespace as its value.");
00041                                 continue;
00042                         }
00043                         switch( $tag ) {
00044                                 case '2#120': /*IPTC caption. mapped with exif ImageDescription*/
00045                                         $data['ImageDescription'] = self::convIPTC( $val, $c );
00046                                         break;
00047                                 case '2#116': /* copyright. Mapped with exif copyright */
00048                                         $data['Copyright'] = self::convIPTC( $val, $c );
00049                                         break;
00050                                 case '2#080': /* byline. Mapped with exif Artist */
00051                                         /* merge with byline title (2:85)
00052                                          * like how exif does it with
00053                                          * Title, person. Not sure if this is best
00054                                          * approach since we no longer have the two fields
00055                                          * separate. each byline title entry corresponds to a
00056                                          * specific byline.                          */
00057 
00058                                         $bylines = self::convIPTC( $val, $c );
00059                                         if ( isset( $parsed['2#085'] ) ) {
00060                                                 $titles = self::convIPTC( $parsed['2#085'], $c );
00061                                         } else {
00062                                                 $titles = array();
00063                                         }
00064 
00065                                         for ( $i = 0; $i < count( $titles ); $i++ ) {
00066                                                 if ( isset( $bylines[$i] ) ) {
00067                                                         // theoretically this should always be set
00068                                                         // but doesn't hurt to be careful.
00069                                                         $bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
00070                                                 }
00071                                         }
00072                                         $data['Artist'] = $bylines;
00073                                         break;
00074                                 case '2#025': /* keywords */
00075                                         $data['Keywords'] = self::convIPTC( $val, $c );
00076                                         break;
00077                                 case '2#101': /* Country (shown)*/
00078                                         $data['CountryDest'] = self::convIPTC( $val, $c );
00079                                         break;
00080                                 case '2#095': /* state/province (shown) */
00081                                         $data['ProvinceOrStateDest'] = self::convIPTC( $val, $c );
00082                                         break;
00083                                 case '2#090': /* city (Shown) */
00084                                         $data['CityDest'] = self::convIPTC( $val, $c );
00085                                         break;
00086                                 case '2#092': /* sublocation (shown) */
00087                                         $data['SublocationDest'] = self::convIPTC( $val, $c );
00088                                         break;
00089                                 case '2#005': /* object name/title */
00090                                         $data['ObjectName'] = self::convIPTC( $val, $c );
00091                                         break;
00092                                 case '2#040': /* special instructions */
00093                                         $data['SpecialInstructions'] = self::convIPTC( $val, $c );
00094                                         break;
00095                                 case '2#105': /* headline*/
00096                                         $data['Headline'] = self::convIPTC( $val, $c );
00097                                         break;
00098                                 case '2#110': /* credit */
00099                                         /*"Identifies the provider of the objectdata,
00100                                          * not necessarily the owner/creator". */
00101                                         $data['Credit'] = self::convIPTC( $val, $c );
00102                                         break;
00103                                 case '2#115': /* source */
00104                                         /* "Identifies the original owner of the intellectual content of the
00105                                          *objectdata. This could be an agency, a member of an agency or
00106                                          *an individual." */
00107                                         $data['Source'] = self::convIPTC( $val, $c );
00108                                         break;
00109 
00110                                 case '2#007': /* edit status (lead, correction, etc) */
00111                                         $data['EditStatus'] = self::convIPTC( $val, $c );
00112                                         break;
00113                                 case '2#015': /* category. deprecated. max 3 letters in theory, often more */
00114                                         $data['iimCategory'] = self::convIPTC( $val, $c );
00115                                         break;
00116                                 case '2#020': /* category. deprecated. */
00117                                         $data['iimSupplementalCategory'] = self::convIPTC( $val, $c );
00118                                         break;
00119                                 case '2#010': /*urgency (1-8. 1 most, 5 normal, 8 low priority)*/
00120                                         $data['Urgency'] = self::convIPTC( $val, $c );
00121                                         break;
00122                                 case '2#022':
00123                                         /* "Identifies objectdata that recurs often and predictably...
00124                                          * Example: Euroweather" */
00125                                         $data['FixtureIdentifier'] = self::convIPTC( $val, $c );
00126                                         break;
00127                                 case '2#026':
00128                                         /* Content location code (iso 3166 + some custom things)
00129                                          * ex: TUR (for turkey), XUN (for UN), XSP (outer space)
00130                                          * See wikipedia article on iso 3166 and appendix D of iim std. */
00131                                         $data['LocationDestCode'] = self::convIPTC( $val, $c );
00132                                         break;
00133                                 case '2#027':
00134                                         /* Content location name. Full printable name
00135                                          * of location of photo. */
00136                                         $data['LocationDest'] = self::convIPTC( $val, $c );
00137                                         break;
00138                                 case '2#065':
00139                                         /* Originating Program.
00140                                          * Combine with Program version (2:70) if present.
00141                                          */
00142                                         $software = self::convIPTC( $val, $c );
00143 
00144                                         if ( count( $software ) !== 1 ) {
00145                                                 //according to iim standard this cannot have multiple values
00146                                                 //so if there is more than one, something weird is happening,
00147                                                 //and we skip it.
00148                                                 wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
00149                                                 break;
00150                                         }
00151 
00152                                         if ( isset( $parsed['2#070'] ) ) {
00153                                                 //if a version is set for the software.
00154                                                 $softwareVersion = self::convIPTC( $parsed['2#070'], $c );
00155                                                 unset($parsed['2#070']);
00156                                                 $data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
00157                                         } else {
00158                                                 $data['Software'] = $software;
00159                                         }
00160                                         break;
00161                                 case '2#075':
00162                                         /* Object cycle.
00163                                          * a for morning (am), p for evening, b for both */
00164                                         $data['ObjectCycle'] = self::convIPTC( $val, $c );
00165                                         break;
00166                                 case '2#100':
00167                                         /* Country/Primary location code.
00168                                          * "Indicates the code of the country/primary location where the
00169                                          * intellectual property of the objectdata was created"
00170                                          * unclear how this differs from 2#026
00171                                          */
00172                                         $data['CountryCodeDest'] = self::convIPTC( $val, $c );
00173                                         break;
00174                                 case '2#103':
00175                                         /* original transmission ref.
00176                                          * "A code representing the location of original transmission ac-
00177                                          * cording to practises of the provider."
00178                                         */
00179                                         $data['OriginalTransmissionRef'] = self::convIPTC( $val, $c );
00180                                         break;
00181                                 case '2#118': /*contact*/
00182                                         $data['Contact'] = self::convIPTC( $val, $c );
00183                                         break;
00184                                 case '2#122':
00185                                         /* Writer/Editor
00186                                          * "Identification of the name of the person involved in the writing,
00187                                          * editing or correcting the objectdata or caption/abstract."
00188                                          */
00189                                         $data['Writer'] = self::convIPTC( $val, $c );
00190                                         break;
00191                                 case '2#135': /* lang code */
00192                                         $data['LanguageCode'] = self::convIPTC( $val, $c );
00193                                         break;
00194 
00195                                 // Start date stuff.
00196                                 // It doesn't accept incomplete dates even though they are valid
00197                                 // according to spec.
00198                                 // Should potentially store timezone as well.
00199                                 case '2#055':
00200                                         //Date created (not date digitized).
00201                                         //Maps to exif DateTimeOriginal
00202                                         if ( isset( $parsed['2#060'] ) ) {
00203                                                 $time = $parsed['2#060'];
00204                                         } else {
00205                                                 $time = Array();
00206                                         }
00207                                         $timestamp =  self::timeHelper( $val, $time, $c );
00208                                         if ($timestamp) {
00209                                                 $data['DateTimeOriginal'] = $timestamp;
00210                                         }
00211                                         break;
00212 
00213                                 case '2#062':
00214                                         //Date converted to digital representation.
00215                                         //Maps to exif DateTimeDigitized
00216                                         if ( isset( $parsed['2#063'] ) ) {
00217                                                 $time = $parsed['2#063'];
00218                                         } else {
00219                                                 $time = Array();
00220                                         }
00221                                         $timestamp =  self::timeHelper( $val, $time, $c );
00222                                         if ($timestamp) {
00223                                                 $data['DateTimeDigitized'] = $timestamp;
00224                                         }
00225                                         break;
00226 
00227                                 case '2#030':
00228                                         //Date released.
00229                                         if ( isset( $parsed['2#035'] ) ) {
00230                                                 $time = $parsed['2#035'];
00231                                         } else {
00232                                                 $time = Array();
00233                                         }
00234                                         $timestamp =  self::timeHelper( $val, $time, $c );
00235                                         if ($timestamp) {
00236                                                 $data['DateTimeReleased'] = $timestamp;
00237                                         }
00238                                         break;
00239 
00240                                 case '2#037':
00241                                         //Date expires.
00242                                         if ( isset( $parsed['2#038'] ) ) {
00243                                                 $time = $parsed['2#038'];
00244                                         } else {
00245                                                 $time = Array();
00246                                         }
00247                                         $timestamp =  self::timeHelper( $val, $time, $c );
00248                                         if ($timestamp) {
00249                                                 $data['DateTimeExpires'] = $timestamp;
00250                                         }
00251                                         break;
00252 
00253                                 case '2#000': /* iim version */
00254                                         // unlike other tags, this is a 2-byte binary number.
00255                                         //technically this is required if there is iptc data
00256                                         //but in practise it isn't always there.
00257                                         if ( strlen( $val[0] ) == 2 ) {
00258                                                 //if is just to be paranoid.
00259                                                 $versionValue = ord( substr( $val[0], 0, 1 ) ) * 256;
00260                                                 $versionValue += ord( substr( $val[0], 1, 1 ) );
00261                                                 $data['iimVersion'] = $versionValue;
00262                                         }
00263                                         break;
00264 
00265                                 case '2#004':
00266                                         // IntellectualGenere.
00267                                         // first 4 characters are an id code
00268                                         // That we're not really interested in.
00269 
00270                                         // This prop is weird, since it's
00271                                         // allowed to have multiple values
00272                                         // in iim 4.1, but not in the XMP
00273                                         // stuff. We're going to just
00274                                         // extract the first value.
00275                                         $con = self::ConvIPTC( $val, $c );
00276                                         if ( strlen( $con[0] ) < 5 ) {
00277                                                 wfDebugLog( 'iptc', 'IPTC: '
00278                                                         . '2:04 too short. '
00279                                                         . 'Ignoring.' );
00280                                                         break;
00281                                         }
00282                                         $extracted = substr( $con[0], 4 );
00283                                         $data['IntellectualGenre'] = $extracted;
00284                                         break;
00285 
00286                                 case '2#012':
00287                                         // Subject News code - this is a compound field
00288                                         // at the moment we only extract the subject news
00289                                         // code, which is an 8 digit (ascii) number
00290                                         // describing the subject matter of the content.
00291                                         $codes = self::convIPTC( $val, $c );
00292                                         foreach ( $codes as $ic ) {
00293                                                 $fields = explode(':', $ic, 3 );
00294 
00295                                                 if ( count( $fields ) < 2 ||
00296                                                         $fields[0] !== 'IPTC' )
00297                                                 {
00298                                                         wfDebugLog( 'IPTC', 'IPTC: '
00299                                                                 . 'Invalid 2:12 - ' . $ic );
00300                                                         break;
00301                                                 }
00302                                                 $data['SubjectNewsCode'] = $fields[1];
00303                                         }
00304                                         break;
00305 
00306                                 // purposely does not do 2:125, 2:130, 2:131,
00307                                 // 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
00308                                 // 2:200, 2:201, 2:202
00309                                 // or the audio stuff (2:150 to 2:154)
00310 
00311                                 case '2#070':
00312                                 case '2#060':
00313                                 case '2#063':
00314                                 case '2#085':
00315                                 case '2#038':
00316                                 case '2#035':
00317                                         //ignore. Handled elsewhere.
00318                                         break;
00319 
00320                                 default:
00321                                         wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ));
00322                                         break;
00323                         }
00324 
00325                 }
00326                 return $data;
00327         }
00328 
00338         private static function timeHelper( $date, $time, $c ) {
00339                 if ( count( $date ) === 1 ) {
00340                         //the standard says this should always be 1
00341                         //just double checking.
00342                         list($date) = self::convIPTC( $date, $c );
00343                 } else {
00344                         return null;
00345                 }
00346 
00347                 if ( count( $time ) === 1 ) {
00348                         list($time) = self::convIPTC( $time, $c );
00349                         $dateOnly = false;
00350                 } else {
00351                         $time = '000000+0000'; //placeholder
00352                         $dateOnly = true;
00353                 }
00354 
00355                 if ( ! ( preg_match('/\d\d\d\d\d\d[-+]\d\d\d\d/', $time)
00356                         && preg_match('/\d\d\d\d\d\d\d\d/', $date)
00357                         && substr($date, 0, 4) !== '0000'
00358                         && substr($date, 4, 2) !== '00'
00359                         && substr($date, 6, 2) !== '00'
00360                  ) ) {
00361                         //something wrong.
00362                         // Note, this rejects some valid dates according to iptc spec
00363                         // for example: the date 00000400 means the photo was taken in
00364                         // April, but the year and day is unknown. We don't process these
00365                         // types of incomplete dates atm.
00366                         wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )");
00367                         return null;
00368                 }
00369 
00370                 $unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ));
00371                 if ( $unixTS === false ) {
00372                         wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );
00373                         return null;
00374                 }
00375 
00376                 $tz = ( intval( substr( $time, 7, 2 ) ) *60*60 )
00377                         + ( intval( substr( $time, 9, 2 ) ) * 60 );
00378 
00379                 if ( substr( $time, 6, 1 ) === '-' ) {
00380                         $tz = - $tz;
00381                 }
00382 
00383                 $finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
00384                 if ( $finalTimestamp === false ) {
00385                         wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );
00386                         return null;
00387                 }
00388                 if ( $dateOnly ) {
00389                         //return the date only
00390                         return substr( $finalTimestamp, 0, 10 );
00391                 } else {
00392                         return $finalTimestamp;
00393                 }
00394         }
00395 
00403         private static function convIPTC ( $data, $charset ) {
00404                 if ( is_array( $data ) ) {
00405                         foreach ($data as &$val) {
00406                                 $val = self::convIPTCHelper( $val, $charset );
00407                         }
00408                 } else {
00409                         $data = self::convIPTCHelper( $data, $charset );
00410                 }
00411 
00412                 return $data;
00413         }
00421         private static function convIPTCHelper ( $data, $charset ) {
00422                 if ( $charset ) {
00423                         wfSuppressWarnings();
00424                         $data = iconv($charset, "UTF-8//IGNORE", $data);
00425                         wfRestoreWarnings();
00426                         if ($data === false) {
00427                                 $data = "";
00428                                 wfDebugLog('iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8");
00429                         }
00430                 } else {
00431                         //treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
00432                         // most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
00433                         $oldData = $data;
00434                         UtfNormal::quickIsNFCVerify( $data ); //make $data valid utf-8
00435                         if ($data === $oldData) {
00436                                 return $data; //if validation didn't change $data
00437                         } else {
00438                                 return self::convIPTCHelper( $oldData, 'Windows-1252' );
00439                         }
00440                 }
00441                 return trim( $data );
00442         }
00443 
00452         static function getCharset($tag) {
00453 
00454                 //According to iim standard, charset is defined by the tag 1:90.
00455                 //in which there are iso 2022 escape sequences to specify the character set.
00456                 //the iim standard seems to encourage that all necessary escape sequences are
00457                 //in the 1:90 tag, but says it doesn't have to be.
00458 
00459                 //This is in need of more testing probably. This is definitely not complete.
00460                 //however reading the docs of some other iptc software, it appears that most iptc software
00461                 //only recognizes utf-8. If 1:90 tag is not present content is
00462                 // usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.
00463 
00464                 //This also won't work if there are more than one escape sequence in the 1:90 tag
00465                 //or if something is put in the G2, or G3 charsets, etc. It will only reliably recognize utf-8.
00466 
00467                 // This is just going through the charsets mentioned in appendix C of the iim standard.
00468 
00469                 //  \x1b = ESC.
00470                 switch ( $tag ) {
00471                         case "\x1b%G": //utf-8
00472                         //Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
00473                         case "\x1b(B": // ascii
00474                         case "\x1b(@": // iso-646-IRV (ascii in latest version, $ different in older version)
00475                                 $c = 'UTF-8';
00476                                 break;
00477                         case "\x1b(A": //like ascii, but british.
00478                                 $c = 'ISO646-GB';
00479                                 break;
00480                         case "\x1b(C": //some obscure sweedish/finland encoding
00481                                 $c = 'ISO-IR-8-1';
00482                                 break;
00483                         case "\x1b(D":
00484                                 $c = 'ISO-IR-8-2';
00485                                 break;
00486                         case "\x1b(E": //some obscure danish/norway encoding
00487                                 $c = 'ISO-IR-9-1';
00488                                 break;
00489                         case "\x1b(F":
00490                                 $c = 'ISO-IR-9-2';
00491                                 break;
00492                         case "\x1b(G":
00493                                 $c = 'SEN_850200_B'; // aka iso 646-SE; ascii-like
00494                                 break;
00495                         case "\x1b(I":
00496                                 $c = "ISO646-IT";
00497                                 break;
00498                         case "\x1b(L":
00499                                 $c = "ISO646-PT";
00500                                 break;
00501                         case "\x1b(Z":
00502                                 $c = "ISO646-ES";
00503                                 break;
00504                         case "\x1b([":
00505                                 $c = "GREEK7-OLD";
00506                                 break;
00507                         case "\x1b(K":
00508                                 $c = "ISO646-DE";
00509                                 break;
00510                         case "\x1b(N":  //crylic
00511                                 $c = "ISO_5427";
00512                                 break;
00513                         case "\x1b(`": //iso646-NO
00514                                 $c = "NS_4551-1";
00515                                 break;
00516                         case "\x1b(f": //iso646-FR
00517                                 $c = "NF_Z_62-010";
00518                                 break;
00519                         case "\x1b(g":
00520                                 $c = "PT2"; //iso646-PT2
00521                                 break;
00522                         case "\x1b(h":
00523                                 $c = "ES2";
00524                                 break;
00525                         case "\x1b(i": //iso646-HU
00526                                 $c = "MSZ_7795.3";
00527                                 break;
00528                         case "\x1b(w":
00529                                 $c = "CSA_Z243.4-1985-1";
00530                                 break;
00531                         case "\x1b(x":
00532                                 $c = "CSA_Z243.4-1985-2";
00533                                 break;
00534                         case "\x1b\$(B":
00535                         case "\x1b\$B":
00536                         case "\x1b&@\x1b\$B":
00537                         case "\x1b&@\x1b\$(B":
00538                                 $c = "JIS_C6226-1983";
00539                                 break;
00540                         case "\x1b-A": // iso-8859-1. at least for the high code characters.
00541                         case "\x1b(@\x1b-A":
00542                         case "\x1b(B\x1b-A":
00543                                 $c = 'ISO-8859-1';
00544                                 break;
00545                         case "\x1b-B": // iso-8859-2. at least for the high code characters.
00546                                 $c = 'ISO-8859-2';
00547                                 break;
00548                         case "\x1b-C": // iso-8859-3. at least for the high code characters.
00549                                 $c = 'ISO-8859-3';
00550                                 break;
00551                         case "\x1b-D": // iso-8859-4. at least for the high code characters.
00552                                 $c = 'ISO-8859-4';
00553                                 break;
00554                         case "\x1b-E": // iso-8859-5. at least for the high code characters.
00555                                 $c = 'ISO-8859-5';
00556                                 break;
00557                         case "\x1b-F": // iso-8859-6. at least for the high code characters.
00558                                 $c = 'ISO-8859-6';
00559                                 break;
00560                         case "\x1b-G": // iso-8859-7. at least for the high code characters.
00561                                 $c = 'ISO-8859-7';
00562                                 break;
00563                         case "\x1b-H": // iso-8859-8. at least for the high code characters.
00564                                 $c = 'ISO-8859-8';
00565                                 break;
00566                         case "\x1b-I": // CSN_369103. at least for the high code characters.
00567                                 $c = 'CSN_369103';
00568                                 break;
00569                         default:
00570                                 wfDebugLog('iptc', __METHOD__ . 'Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
00571                                 //at this point just give up and refuse to parse iptc?
00572                                 $c = false;
00573                 }
00574                 return $c;
00575         }
00576 }