MediaWiki  REL1_24
IPTC.php
Go to the documentation of this file.
00001 <?php
/**
 * Helpers for reading IPTC-IIM metadata embedded in image files.
 *
 * parse() turns a raw IPTC block into an array keyed by property names
 * (mostly borrowed from Exif); getCharset() maps the ISO 2022 escape sequence
 * found in IPTC tag 1:90 to an iconv charset name.
 */
class IPTC {
    /**
     * Tags whose value list is copied unchanged (after charset conversion)
     * to a single output key.
     *
     * Keys are iptcparse() tag ids ("record#dataset"); values are the keys
     * used in the array returned by parse().
     */
    private static $simpleTags = array(
        '2#120' => 'ImageDescription', // caption; mapped with exif ImageDescription
        '2#116' => 'Copyright', // mapped with exif Copyright
        '2#025' => 'Keywords',
        '2#101' => 'CountryDest', // country (shown)
        '2#095' => 'ProvinceOrStateDest', // state/province (shown)
        '2#090' => 'CityDest', // city (shown)
        '2#092' => 'SublocationDest', // sublocation (shown)
        '2#005' => 'ObjectName', // object name/title
        '2#040' => 'SpecialInstructions',
        '2#105' => 'Headline',
        // "Identifies the provider of the objectdata, not necessarily the owner/creator".
        '2#110' => 'Credit',
        // "Identifies the original owner of the intellectual content of the
        // objectdata. This could be an agency, a member of an agency or an individual."
        '2#115' => 'Source',
        '2#007' => 'EditStatus', // lead, correction, etc
        '2#015' => 'iimCategory', // deprecated. max 3 letters in theory, often more
        '2#020' => 'iimSupplementalCategory', // deprecated
        '2#010' => 'Urgency', // 1-8. 1 most, 5 normal, 8 low priority
        // "Identifies objectdata that recurs often and predictably... Example: Euroweather"
        '2#022' => 'FixtureIdentifier',
        // Content location code (iso 3166 + some custom things),
        // ex: TUR (for turkey), XUN (for UN), XSP (outer space).
        // See wikipedia article on iso 3166 and appendix D of iim std.
        '2#026' => 'LocationDestCode',
        // Content location name. Full printable name of location of photo.
        '2#027' => 'LocationDest',
        '2#075' => 'ObjectCycle', // a for morning (am), p for evening, b for both
        // Country/Primary location code: "Indicates the code of the
        // country/primary location where the intellectual property of the
        // objectdata was created". Unclear how this differs from 2#026.
        '2#100' => 'CountryCodeDest',
        // "A code representing the location of original transmission
        // according to practises of the provider."
        '2#103' => 'OriginalTransmissionRef',
        '2#118' => 'Contact',
        // "Identification of the name of the person involved in the writing,
        // editing or correcting the objectdata or caption/abstract."
        '2#122' => 'Writer',
        '2#135' => 'LanguageCode',
    );

    /**
     * Date tags. Each maps to array( companion time tag, output key ).
     *
     * Incomplete dates are not accepted even though they are valid according
     * to the spec. Should potentially store the timezone as well.
     */
    private static $dateTags = array(
        // Date created (not date digitized). Maps to exif DateTimeOriginal.
        '2#055' => array( '2#060', 'DateTimeOriginal' ),
        // Date converted to digital representation. Maps to exif DateTimeDigitized.
        '2#062' => array( '2#063', 'DateTimeDigitized' ),
        // Date released.
        '2#030' => array( '2#035', 'DateTimeReleased' ),
        // Date expires.
        '2#037' => array( '2#038', 'DateTimeExpires' ),
    );

    /**
     * ISO 2022 escape sequences (as found in tag 1:90) => iconv charset name.
     * "\x1b" is ESC. These are the charsets mentioned in appendix C of the
     * iim standard.
     */
    private static $charsets = array(
        "\x1b%G" => 'UTF-8',
        // Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
        "\x1b(B" => 'UTF-8', // ascii
        "\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
        "\x1b(A" => 'ISO646-GB', // like ascii, but british
        "\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finland encoding
        "\x1b(D" => 'ISO-IR-8-2',
        "\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norway encoding
        "\x1b(F" => 'ISO-IR-9-2',
        "\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
        "\x1b(I" => 'ISO646-IT',
        "\x1b(L" => 'ISO646-PT',
        "\x1b(Z" => 'ISO646-ES',
        "\x1b([" => 'GREEK7-OLD',
        "\x1b(K" => 'ISO646-DE',
        "\x1b(N" => 'ISO_5427', // cyrillic
        "\x1b(`" => 'NS_4551-1', // iso646-NO
        "\x1b(f" => 'NF_Z_62-010', // iso646-FR
        "\x1b(g" => 'PT2', // iso646-PT2
        "\x1b(h" => 'ES2',
        "\x1b(i" => 'MSZ_7795.3', // iso646-HU
        "\x1b(w" => 'CSA_Z243.4-1985-1',
        "\x1b(x" => 'CSA_Z243.4-1985-2',
        "\x1b\$(B" => 'JIS_C6226-1983',
        "\x1b\$B" => 'JIS_C6226-1983',
        "\x1b&@\x1b\$B" => 'JIS_C6226-1983',
        "\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
        // iso-8859-x: at least for the high code characters.
        "\x1b-A" => 'ISO-8859-1',
        "\x1b(@\x1b-A" => 'ISO-8859-1',
        "\x1b(B\x1b-A" => 'ISO-8859-1',
        "\x1b-B" => 'ISO-8859-2',
        "\x1b-C" => 'ISO-8859-3',
        "\x1b-D" => 'ISO-8859-4',
        "\x1b-E" => 'ISO-8859-5',
        "\x1b-F" => 'ISO-8859-6',
        "\x1b-G" => 'ISO-8859-7',
        "\x1b-H" => 'ISO-8859-8',
        "\x1b-I" => 'CSN_369103',
    );

    /**
     * Extract the supported properties from a raw IPTC block.
     *
     * Purposely does not handle 2:125, 2:130, 2:131, 2:47, 2:50, 2:45, 2:42,
     * 2:8, 2:3, 2:200, 2:201, 2:202 or the audio tags (2:150 to 2:154).
     *
     * @param string $rawData Raw block, as accepted by iptcparse()
     * @return array Property name => value. Most values are lists of strings;
     *   the exceptions are the date properties, IntellectualGenre and
     *   SubjectNewsCode (strings), iimVersion (int) and Software when a
     *   version is present (a list holding one array( name, version ) pair).
     *   Empty if the data cannot be parsed or declares an unknown charset.
     */
    public static function parse( $rawData ) {
        $parsed = iptcparse( $rawData );
        $data = array();
        if ( !is_array( $parsed ) ) {
            return $data;
        }

        $c = '';
        // Charset info is contained in tag 1:90.
        if ( isset( $parsed['1#090'] ) && isset( $parsed['1#090'][0] ) ) {
            $c = self::getCharset( $parsed['1#090'][0] );
            if ( $c === false ) {
                // Unknown charset: refuse to parse. Note there is a difference
                // between an unknown charset and no charset being specified.
                return array();
            }
            unset( $parsed['1#090'] );
        }

        foreach ( $parsed as $tag => $val ) {
            if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
                wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
                continue;
            }

            if ( isset( self::$simpleTags[$tag] ) ) {
                $data[self::$simpleTags[$tag]] = self::convIPTC( $val, $c );
                continue;
            }

            if ( isset( self::$dateTags[$tag] ) ) {
                list( $timeTag, $key ) = self::$dateTags[$tag];
                $time = isset( $parsed[$timeTag] ) ? $parsed[$timeTag] : array();
                $timestamp = self::timeHelper( $val, $time, $c );
                if ( $timestamp ) {
                    $data[$key] = $timestamp;
                }
                continue;
            }

            switch ( $tag ) {
                case '2#080':
                    // Byline. Mapped with exif Artist.
                    // Merged with byline title (2:85) like how exif does it
                    // with "Title, person". Not sure if this is the best
                    // approach since the two fields are no longer separate;
                    // each byline title entry corresponds to a specific byline.
                    $bylines = self::convIPTC( $val, $c );
                    if ( isset( $parsed['2#085'] ) ) {
                        $titles = self::convIPTC( $parsed['2#085'], $c );
                    } else {
                        $titles = array();
                    }

                    $titleCount = count( $titles );
                    for ( $i = 0; $i < $titleCount; $i++ ) {
                        // Theoretically always set, but doesn't hurt to be careful.
                        if ( isset( $bylines[$i] ) ) {
                            $bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
                        }
                    }
                    $data['Artist'] = $bylines;
                    break;

                case '2#065':
                    // Originating program.
                    // Combined with program version (2:70) if present.
                    $software = self::convIPTC( $val, $c );

                    if ( count( $software ) !== 1 ) {
                        // According to the iim standard this cannot have
                        // multiple values, so if there is more than one,
                        // something weird is happening and we skip it.
                        wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
                        break;
                    }

                    if ( isset( $parsed['2#070'] ) ) {
                        // A version is set for the software.
                        $softwareVersion = self::convIPTC( $parsed['2#070'], $c );
                        $data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
                    } else {
                        $data['Software'] = $software;
                    }
                    break;

                case '2#000':
                    // iim version. Unlike other tags, this is a 2-byte
                    // big-endian binary number. Technically required if there
                    // is iptc data, but in practise it isn't always there.
                    if ( strlen( $val[0] ) === 2 ) {
                        // The length check is just to be paranoid.
                        $versionValue = ord( substr( $val[0], 0, 1 ) ) * 256;
                        $versionValue += ord( substr( $val[0], 1, 1 ) );
                        $data['iimVersion'] = $versionValue;
                    }
                    break;

                case '2#004':
                    // Intellectual genre. The first 4 characters are an id
                    // code that we're not really interested in.
                    // This property is weird, since it's allowed to have
                    // multiple values in iim 4.1, but not in the XMP stuff.
                    // Only the first value is extracted.
                    $con = self::convIPTC( $val, $c );
                    if ( strlen( $con[0] ) < 5 ) {
                        wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
                        break;
                    }
                    $data['IntellectualGenre'] = substr( $con[0], 4 );
                    break;

                case '2#012':
                    // Subject news code. This is a compound field; at the
                    // moment only the subject news code itself is extracted,
                    // which is an 8 digit (ascii) number describing the
                    // subject matter of the content.
                    $codes = self::convIPTC( $val, $c );
                    foreach ( $codes as $ic ) {
                        $fields = explode( ':', $ic, 3 );

                        if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
                            wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
                            // Leaves the inner foreach only: remaining codes
                            // for this tag are skipped.
                            break;
                        }
                        // Each valid code overwrites the previous one.
                        $data['SubjectNewsCode'] = $fields[1];
                    }
                    break;

                case '2#070':
                case '2#060':
                case '2#063':
                case '2#085':
                case '2#038':
                case '2#035':
                    // Ignore. Handled together with their companion tag.
                    break;

                default:
                    wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
                    break;
            }
        }

        return $data;
    }

    /**
     * Convert an IPTC date tag, plus its optional companion time tag, into an
     * Exif-style timestamp string.
     *
     * @param array $date Values of the date tag (CCYYMMDD); must hold exactly one
     * @param array $time Values of the time tag (HHMMSS+HHMM / HHMMSS-HHMM).
     *   If it does not hold exactly one value only the date is returned.
     * @param string $c Charset, as accepted by convIPTC()
     * @return string|null Result of wfTimestamp( TS_EXIF, ... ), cut to its
     *   first 10 characters when no time was given; null if the input is
     *   malformed or cannot be converted.
     */
    private static function timeHelper( $date, $time, $c ) {
        if ( count( $date ) !== 1 ) {
            // The standard says this should always be 1; just double checking.
            return null;
        }
        list( $date ) = self::convIPTC( $date, $c );
        $date = trim( $date );

        if ( count( $time ) === 1 ) {
            list( $time ) = self::convIPTC( $time, $c );
            $time = trim( $time );
            $dateOnly = false;
        } else {
            $time = '000000+0000'; // placeholder
            $dateOnly = true;
        }

        // The patterns are anchored because the code below slices both
        // strings at fixed offsets; a match in the middle of a longer string
        // would otherwise be sliced in the wrong place.
        if ( !preg_match( '/^\d{6}[-+]\d{4}$/', $time )
            || !preg_match( '/^\d{8}$/', $date )
            || substr( $date, 0, 4 ) === '0000'
            || substr( $date, 4, 2 ) === '00'
            || substr( $date, 6, 2 ) === '00'
        ) {
            // Something is wrong.
            // Note, this rejects some valid dates according to the iptc spec.
            // For example: the date 00000400 means the photo was taken in
            // April, but the year and day are unknown. These types of
            // incomplete dates are not processed at the moment.
            wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

            return null;
        }

        $unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
        if ( $unixTS === false ) {
            wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

            return null;
        }

        // Offset from UTC in seconds, signed.
        $tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
            + ( intval( substr( $time, 9, 2 ) ) * 60 );

        if ( substr( $time, 6, 1 ) === '-' ) {
            $tz = -$tz;
        }

        // NOTE(review): the offset is added to the local time. If the intent
        // is to normalise to UTC it would need to be subtracted - confirm
        // against the callers' expectations before changing.
        $finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
        if ( $finalTimestamp === false ) {
            wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

            return null;
        }

        if ( $dateOnly ) {
            // Return the date part only.
            return substr( $finalTimestamp, 0, 10 );
        }

        return $finalTimestamp;
    }

    /**
     * Convert a value, or each element of an array of values, to UTF-8.
     *
     * @param string|array $data Value(s) to convert
     * @param string $charset Source charset; empty to let convIPTCHelper() guess
     * @return string|array Converted data, same shape as the input
     */
    private static function convIPTC( $data, $charset ) {
        if ( is_array( $data ) ) {
            foreach ( $data as &$val ) {
                $val = self::convIPTCHelper( $val, $charset );
            }
            // Break the reference so later writes to $val cannot touch $data.
            unset( $val );
        } else {
            $data = self::convIPTCHelper( $data, $charset );
        }

        return $data;
    }

    /**
     * Convert a single string to UTF-8.
     *
     * With a charset the string is run through iconv() and trimmed; a failed
     * conversion yields an empty string. Without one, the string is returned
     * as is when UtfNormal::quickIsNFCVerify() leaves it unchanged, and is
     * otherwise converted as Windows-1252.
     *
     * @param string $data Value to convert
     * @param string $charset Source charset, or an empty value to guess
     * @return string
     */
    private static function convIPTCHelper( $data, $charset ) {
        if ( $charset ) {
            wfSuppressWarnings();
            $data = iconv( $charset, "UTF-8//IGNORE", $data );
            wfRestoreWarnings();
            if ( $data === false ) {
                $data = "";
                wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
            }
        } else {
            // Treat as utf-8 if it is valid utf-8, otherwise pretend it is
            // windows-1252. Most of the time if there is no 1:90 tag, the
            // data is either ascii, latin1, or utf-8.
            $oldData = $data;
            // Per the original inline comment this makes $data valid utf-8
            // in place, so any change means the input was not valid.
            UtfNormal::quickIsNFCVerify( $data );
            if ( $data === $oldData ) {
                // NOTE(review): unlike the other paths this one returns the
                // value untrimmed - confirm whether that is intended.
                return $data;
            } else {
                return self::convIPTCHelper( $oldData, 'Windows-1252' );
            }
        }

        return trim( $data );
    }

    /**
     * Map the content of IPTC tag 1:90 to an iconv charset name.
     *
     * According to the iim standard, the charset is defined by tag 1:90, which
     * holds iso 2022 escape sequences. The standard seems to encourage that
     * all necessary escape sequences are in the 1:90 tag, but says they don't
     * have to be.
     *
     * This probably needs more testing and is definitely not complete.
     * Reading the docs of other iptc software, most of it appears to
     * recognize only utf-8. If the 1:90 tag is not present the content is
     * usually ascii or iso-8859-1 (and sometimes utf-8), but there is no
     * guarantee.
     *
     * This won't work if there is more than one escape sequence in the tag
     * (beyond the few combinations listed), or if something is put in the G2
     * or G3 charsets, etc. It will only reliably recognize utf-8.
     *
     * @param string $tag Raw value of tag 1:90
     * @return string|bool Charset name, or false if the value is not recognized
     */
    public static function getCharset( $tag ) {
        if ( is_string( $tag ) && isset( self::$charsets[$tag] ) ) {
            return self::$charsets[$tag];
        }

        wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

        // At this point just give up; parse() refuses to parse the data.
        return false;
    }
}