MediaWiki
REL1_19
|
<?php
/**
 * Extracts IPTC-IIM metadata from a raw IPTC block and maps the tags onto
 * Exif/XMP-style property names.
 */
class IPTC {

	/**
	 * Parse a raw IPTC block into an array of named properties.
	 *
	 * The block is decoded with iptcparse(). If a 1:90 (charset) tag is present
	 * and getCharset() does not recognise it, nothing is extracted. Tags whose
	 * first value consists only of whitespace are skipped.
	 *
	 * @param $rawData String The raw IPTC block, as accepted by iptcparse()
	 * @return Array Map of property name => value. Most values are arrays of
	 *   converted strings; the DateTime* properties, IntellectualGenre and
	 *   SubjectNewsCode are single strings, iimVersion is an integer and
	 *   Software may be array( array( name, version ) ). Empty array if the
	 *   block cannot be parsed or uses an unknown charset.
	 */
	public static function parse( $rawData ) {
		$parsed = iptcparse( $rawData );
		$data = array();
		if ( !is_array( $parsed ) ) {
			// iptcparse() did not return any tags.
			return $data;
		}

		$c = '';
		// Charset info is contained in tag 1:90.
		if ( isset( $parsed['1#090'][0] ) ) {
			$c = self::getCharset( $parsed['1#090'][0] );
			if ( $c === false ) {
				// Unknown charset: refuse to parse.
				// Note: there is a difference between an unknown charset
				// and no charset being specified at all.
				return array();
			}
			unset( $parsed['1#090'] );
		}

		foreach ( $parsed as $tag => $val ) {
			if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
				wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
				continue;
			}
			switch ( $tag ) {
				case '2#120': /* IPTC caption. Mapped with exif ImageDescription */
					$data['ImageDescription'] = self::convIPTC( $val, $c );
					break;
				case '2#116': /* copyright. Mapped with exif Copyright */
					$data['Copyright'] = self::convIPTC( $val, $c );
					break;
				case '2#080': /* byline. Mapped with exif Artist */
					/* Merge with byline title (2:85), like how exif does it
					 * with "Title, person". Not sure if this is the best
					 * approach since we no longer have the two fields
					 * separate. Each byline title entry corresponds to a
					 * specific byline. */
					$bylines = self::convIPTC( $val, $c );
					if ( isset( $parsed['2#085'] ) ) {
						$titles = self::convIPTC( $parsed['2#085'], $c );
					} else {
						$titles = array();
					}

					$titleCount = count( $titles );
					for ( $i = 0; $i < $titleCount; $i++ ) {
						if ( isset( $bylines[$i] ) ) {
							// Theoretically this should always be set,
							// but it doesn't hurt to be careful.
							$bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
						}
					}
					$data['Artist'] = $bylines;
					break;
				case '2#025': /* keywords */
					$data['Keywords'] = self::convIPTC( $val, $c );
					break;
				case '2#101': /* country (shown) */
					$data['CountryDest'] = self::convIPTC( $val, $c );
					break;
				case '2#095': /* state/province (shown) */
					$data['ProvinceOrStateDest'] = self::convIPTC( $val, $c );
					break;
				case '2#090': /* city (shown) */
					$data['CityDest'] = self::convIPTC( $val, $c );
					break;
				case '2#092': /* sublocation (shown) */
					$data['SublocationDest'] = self::convIPTC( $val, $c );
					break;
				case '2#005': /* object name/title */
					$data['ObjectName'] = self::convIPTC( $val, $c );
					break;
				case '2#040': /* special instructions */
					$data['SpecialInstructions'] = self::convIPTC( $val, $c );
					break;
				case '2#105': /* headline */
					$data['Headline'] = self::convIPTC( $val, $c );
					break;
				case '2#110': /* credit */
					/* "Identifies the provider of the objectdata,
					 * not necessarily the owner/creator". */
					$data['Credit'] = self::convIPTC( $val, $c );
					break;
				case '2#115': /* source */
					/* "Identifies the original owner of the intellectual content of the
					 * objectdata. This could be an agency, a member of an agency or
					 * an individual." */
					$data['Source'] = self::convIPTC( $val, $c );
					break;

				case '2#007': /* edit status (lead, correction, etc) */
					$data['EditStatus'] = self::convIPTC( $val, $c );
					break;
				case '2#015': /* category. Deprecated. Max 3 letters in theory, often more */
					$data['iimCategory'] = self::convIPTC( $val, $c );
					break;
				case '2#020': /* supplemental category. Deprecated. */
					$data['iimSupplementalCategory'] = self::convIPTC( $val, $c );
					break;
				case '2#010': /* urgency (1-8. 1 most, 5 normal, 8 low priority) */
					$data['Urgency'] = self::convIPTC( $val, $c );
					break;
				case '2#022':
					/* "Identifies objectdata that recurs often and predictably...
					 * Example: Euroweather" */
					$data['FixtureIdentifier'] = self::convIPTC( $val, $c );
					break;
				case '2#026':
					/* Content location code (iso 3166 + some custom things)
					 * ex: TUR (for turkey), XUN (for UN), XSP (outer space)
					 * See wikipedia article on iso 3166 and appendix D of iim std. */
					$data['LocationDestCode'] = self::convIPTC( $val, $c );
					break;
				case '2#027':
					/* Content location name. Full printable name
					 * of location of photo. */
					$data['LocationDest'] = self::convIPTC( $val, $c );
					break;
				case '2#065':
					/* Originating Program.
					 * Combined with Program version (2:70) if present. */
					$software = self::convIPTC( $val, $c );

					if ( count( $software ) !== 1 ) {
						// According to the iim standard this cannot have multiple
						// values, so if there is more than one, something weird
						// is happening and we skip it.
						wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
						break;
					}

					if ( isset( $parsed['2#070'] ) ) {
						// A version is set for the software.
						$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
						unset( $parsed['2#070'] );
						$data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
					} else {
						$data['Software'] = $software;
					}
					break;
				case '2#075':
					/* Object cycle.
					 * a for morning (am), p for evening, b for both */
					$data['ObjectCycle'] = self::convIPTC( $val, $c );
					break;
				case '2#100':
					/* Country/Primary location code.
					 * "Indicates the code of the country/primary location where the
					 * intellectual property of the objectdata was created"
					 * Unclear how this differs from 2#026. */
					$data['CountryCodeDest'] = self::convIPTC( $val, $c );
					break;
				case '2#103':
					/* Original transmission ref.
					 * "A code representing the location of original transmission
					 * according to practices of the provider." */
					$data['OriginalTransmissionRef'] = self::convIPTC( $val, $c );
					break;
				case '2#118': /* contact */
					$data['Contact'] = self::convIPTC( $val, $c );
					break;
				case '2#122':
					/* Writer/Editor
					 * "Identification of the name of the person involved in the writing,
					 * editing or correcting the objectdata or caption/abstract." */
					$data['Writer'] = self::convIPTC( $val, $c );
					break;
				case '2#135': /* lang code */
					$data['LanguageCode'] = self::convIPTC( $val, $c );
					break;

				// Start date stuff.
				// Incomplete dates are not accepted even though they are valid
				// according to the spec.
				// Should potentially store the timezone as well.
				case '2#055':
					// Date created (not date digitized).
					// Maps to exif DateTimeOriginal. Time is in 2:60.
					if ( isset( $parsed['2#060'] ) ) {
						$time = $parsed['2#060'];
					} else {
						$time = array();
					}
					$timestamp = self::timeHelper( $val, $time, $c );
					if ( $timestamp ) {
						$data['DateTimeOriginal'] = $timestamp;
					}
					break;

				case '2#062':
					// Date converted to digital representation.
					// Maps to exif DateTimeDigitized. Time is in 2:63.
					if ( isset( $parsed['2#063'] ) ) {
						$time = $parsed['2#063'];
					} else {
						$time = array();
					}
					$timestamp = self::timeHelper( $val, $time, $c );
					if ( $timestamp ) {
						$data['DateTimeDigitized'] = $timestamp;
					}
					break;

				case '2#030':
					// Date released. Time is in 2:35.
					if ( isset( $parsed['2#035'] ) ) {
						$time = $parsed['2#035'];
					} else {
						$time = array();
					}
					$timestamp = self::timeHelper( $val, $time, $c );
					if ( $timestamp ) {
						$data['DateTimeReleased'] = $timestamp;
					}
					break;

				case '2#037':
					// Date expires. Time is in 2:38.
					if ( isset( $parsed['2#038'] ) ) {
						$time = $parsed['2#038'];
					} else {
						$time = array();
					}
					$timestamp = self::timeHelper( $val, $time, $c );
					if ( $timestamp ) {
						$data['DateTimeExpires'] = $timestamp;
					}
					break;

				case '2#000': /* iim version */
					// Unlike other tags, this is a 2-byte binary number.
					// Technically this is required if there is iptc data,
					// but in practice it isn't always there.
					if ( strlen( $val[0] ) == 2 ) {
						// The length check is just to be paranoid.
						// Decode as big-endian: high byte first.
						$versionValue = ord( substr( $val[0], 0, 1 ) ) * 256;
						$versionValue += ord( substr( $val[0], 1, 1 ) );
						$data['iimVersion'] = $versionValue;
					}
					break;

				case '2#004':
					// IntellectualGenre.
					// The first 4 characters are an id code
					// that we're not really interested in.

					// This property is weird, since it's allowed to have
					// multiple values in iim 4.1, but not in the XMP stuff.
					// We just extract the first value.
					$con = self::convIPTC( $val, $c );
					if ( strlen( $con[0] ) < 5 ) {
						wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
						break;
					}
					$extracted = substr( $con[0], 4 );
					$data['IntellectualGenre'] = $extracted;
					break;

				case '2#012':
					// Subject News code - this is a compound field.
					// At the moment we only extract the subject news
					// code, which is an 8 digit (ascii) number
					// describing the subject matter of the content.
					// If several entries are present, the last valid one
					// before any malformed entry is the one kept.
					$codes = self::convIPTC( $val, $c );
					foreach ( $codes as $ic ) {
						$fields = explode( ':', $ic, 3 );

						if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
							wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
							// Stop at the first malformed entry.
							break;
						}
						$data['SubjectNewsCode'] = $fields[1];
					}
					break;

				// Purposely does not do 2:125, 2:130, 2:131,
				// 2:47, 2:50, 2:45, 2:42, 2:8, 2:3,
				// 2:200, 2:201, 2:202
				// or the audio stuff (2:150 to 2:154).

				case '2#070':
				case '2#060':
				case '2#063':
				case '2#085':
				case '2#038':
				case '2#035':
					// Ignore. Handled together with their companion tags above.
					break;

				default:
					wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
					break;
			}
		}
		return $data;
	}

	/**
	 * Combine an IPTC date tag and its optional time tag into one timestamp.
	 *
	 * @param $date Array Values of the date tag; must hold exactly one
	 *   value of the form CCYYMMDD
	 * @param $time Array Values of the matching time tag (HHMMSS+HHMM or
	 *   HHMMSS-HHMM). If it does not hold exactly one value, only the date
	 *   part is returned.
	 * @param $c String Charset the values are encoded in ('' if unspecified)
	 * @return String|null Result of wfTimestamp( TS_EXIF, ... ), cut to its
	 *   first 10 characters when no time was given; null if the input is
	 *   invalid or cannot be converted.
	 */
	private static function timeHelper( $date, $time, $c ) {
		if ( count( $date ) !== 1 ) {
			// The standard says there should always be exactly one date.
			return null;
		}
		list( $date ) = self::convIPTC( $date, $c );

		if ( count( $time ) === 1 ) {
			list( $time ) = self::convIPTC( $time, $c );
			$dateOnly = false;
		} else {
			$time = '000000+0000'; // placeholder
			$dateOnly = true;
		}

		// Both patterns are anchored at the start because the code below
		// slices $date and $time by fixed offsets counted from position 0.
		if ( !( preg_match( '/^\d{6}[-+]\d{4}/', $time )
			&& preg_match( '/^\d{8}/', $date )
			&& substr( $date, 0, 4 ) !== '0000'
			&& substr( $date, 4, 2 ) !== '00'
			&& substr( $date, 6, 2 ) !== '00'
		) ) {
			// Something is wrong.
			// Note, this rejects some valid dates according to the iptc spec.
			// For example: the date 00000400 means the photo was taken in
			// April, but the year and day are unknown. We don't process these
			// types of incomplete dates at the moment.
			wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );
			return null;
		}

		$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
		if ( $unixTS === false ) {
			wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );
			return null;
		}

		// Timezone offset in seconds: hours at offset 7, minutes at offset 9,
		// sign at offset 6.
		$tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
			+ ( intval( substr( $time, 9, 2 ) ) * 60 );

		if ( substr( $time, 6, 1 ) === '-' ) {
			$tz = -$tz;
		}

		// NOTE(review): the offset is added to the parsed time rather than
		// subtracted from it - confirm this is the intended direction.
		$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
		if ( $finalTimestamp === false ) {
			wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );
			return null;
		}
		if ( $dateOnly ) {
			// Return the date part only.
			return substr( $finalTimestamp, 0, 10 );
		} else {
			return $finalTimestamp;
		}
	}

	/**
	 * Convert a tag value, or every value in an array of tag values,
	 * using convIPTCHelper().
	 *
	 * @param $data String|Array The value(s) to convert
	 * @param $charset String Charset the data is in ('' if unspecified)
	 * @return String|Array The converted value(s), same shape as the input
	 */
	private static function convIPTC( $data, $charset ) {
		if ( is_array( $data ) ) {
			foreach ( $data as &$val ) {
				$val = self::convIPTCHelper( $val, $charset );
			}
			// Break the reference so the last element is not left aliased.
			unset( $val );
		} else {
			$data = self::convIPTCHelper( $data, $charset );
		}

		return $data;
	}

	/**
	 * Convert a single string to UTF-8.
	 *
	 * With a charset, the data is run through iconv() and trimmed; on iconv
	 * failure an empty string is returned. Without a charset, the data is
	 * returned unchanged (and untrimmed) if UtfNormal::quickIsNFCVerify()
	 * leaves it untouched, otherwise it is converted as Windows-1252.
	 *
	 * @param $data String The string to convert
	 * @param $charset String Charset name passed to iconv(), or a falsy
	 *   value if the charset is unspecified
	 * @return String The converted string
	 */
	private static function convIPTCHelper( $data, $charset ) {
		if ( $charset ) {
			wfSuppressWarnings();
			$data = iconv( $charset, "UTF-8//IGNORE", $data );
			wfRestoreWarnings();
			if ( $data === false ) {
				$data = "";
				wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
			}
		} else {
			// Treat as utf-8 if it is valid utf-8, otherwise pretend it's windows-1252.
			// Most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8.
			$oldData = $data;
			UtfNormal::quickIsNFCVerify( $data ); // make $data valid utf-8
			if ( $data === $oldData ) {
				return $data; // validation didn't change $data
			} else {
				return self::convIPTCHelper( $oldData, 'Windows-1252' );
			}
		}
		return trim( $data );
	}

	/**
	 * Map the content of the 1:90 tag (an ISO 2022 escape sequence) to the
	 * charset name used for conversion.
	 *
	 * @param $tag String Value of the 1:90 tag
	 * @return String|bool Charset name, or false if the escape sequence is
	 *   not one of the recognised ones
	 */
	public static function getCharset( $tag ) {

		// According to the iim standard, the charset is defined by the tag 1:90,
		// in which there are iso 2022 escape sequences to specify the character set.
		// The iim standard seems to encourage that all necessary escape sequences are
		// in the 1:90 tag, but says it doesn't have to be.

		// This is in need of more testing probably. This is definitely not complete.
		// However, reading the docs of some other iptc software, it appears that most
		// iptc software only recognizes utf-8. If the 1:90 tag is not present, content is
		// usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.

		// This also won't work if there is more than one escape sequence in the 1:90 tag
		// or if something is put in the G2 or G3 charsets, etc. It will only reliably
		// recognize utf-8.

		// This is just going through the charsets mentioned in appendix C of the iim standard.

		// \x1b = ESC.
		switch ( $tag ) {
			case "\x1b%G": // utf-8
			// Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
			case "\x1b(B": // ascii
			case "\x1b(@": // iso-646-IRV (ascii in latest version, $ different in older version)
				$c = 'UTF-8';
				break;
			case "\x1b(A": // like ascii, but british.
				$c = 'ISO646-GB';
				break;
			case "\x1b(C": // some obscure swedish/finnish encoding
				$c = 'ISO-IR-8-1';
				break;
			case "\x1b(D":
				$c = 'ISO-IR-8-2';
				break;
			case "\x1b(E": // some obscure danish/norwegian encoding
				$c = 'ISO-IR-9-1';
				break;
			case "\x1b(F":
				$c = 'ISO-IR-9-2';
				break;
			case "\x1b(G":
				$c = 'SEN_850200_B'; // aka iso 646-SE; ascii-like
				break;
			case "\x1b(I":
				$c = "ISO646-IT";
				break;
			case "\x1b(L":
				$c = "ISO646-PT";
				break;
			case "\x1b(Z":
				$c = "ISO646-ES";
				break;
			case "\x1b([":
				$c = "GREEK7-OLD";
				break;
			case "\x1b(K":
				$c = "ISO646-DE";
				break;
			case "\x1b(N": // cyrillic
				$c = "ISO_5427";
				break;
			case "\x1b(`": // iso646-NO
				$c = "NS_4551-1";
				break;
			case "\x1b(f": // iso646-FR
				$c = "NF_Z_62-010";
				break;
			case "\x1b(g":
				$c = "PT2"; // iso646-PT2
				break;
			case "\x1b(h":
				$c = "ES2";
				break;
			case "\x1b(i": // iso646-HU
				$c = "MSZ_7795.3";
				break;
			case "\x1b(w":
				$c = "CSA_Z243.4-1985-1";
				break;
			case "\x1b(x":
				$c = "CSA_Z243.4-1985-2";
				break;
			case "\x1b\$(B":
			case "\x1b\$B":
			case "\x1b&@\x1b\$B":
			case "\x1b&@\x1b\$(B":
				$c = "JIS_C6226-1983";
				break;
			case "\x1b-A": // iso-8859-1. At least for the high code characters.
			case "\x1b(@\x1b-A":
			case "\x1b(B\x1b-A":
				$c = 'ISO-8859-1';
				break;
			case "\x1b-B": // iso-8859-2. At least for the high code characters.
				$c = 'ISO-8859-2';
				break;
			case "\x1b-C": // iso-8859-3. At least for the high code characters.
				$c = 'ISO-8859-3';
				break;
			case "\x1b-D": // iso-8859-4. At least for the high code characters.
				$c = 'ISO-8859-4';
				break;
			case "\x1b-E": // iso-8859-5. At least for the high code characters.
				$c = 'ISO-8859-5';
				break;
			case "\x1b-F": // iso-8859-6. At least for the high code characters.
				$c = 'ISO-8859-6';
				break;
			case "\x1b-G": // iso-8859-7. At least for the high code characters.
				$c = 'ISO-8859-7';
				break;
			case "\x1b-H": // iso-8859-8. At least for the high code characters.
				$c = 'ISO-8859-8';
				break;
			case "\x1b-I": // CSN_369103. At least for the high code characters.
				$c = 'CSN_369103';
				break;
			default:
				wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
				// At this point just give up and refuse to parse iptc?
				$c = false;
		}
		return $c;
	}
}