MediaWiki
REL1_24
|
<?php

/**
 * Reads IPTC-IIM metadata out of a raw IPTC block and maps the recognised
 * tags onto Exif-style property names.
 *
 * Tags are identified by the "record#dataset" keys produced by iptcparse(),
 * e.g. '2#120' for IIM 2:120.
 */
class IPTC {

	/**
	 * Tags that are copied into the result after charset conversion and
	 * nothing else. Maps iptcparse() key => result key.
	 *
	 * @var array
	 */
	private static $simpleTags = array(
		'2#120' => 'ImageDescription', // caption; mapped with exif ImageDescription
		'2#116' => 'Copyright', // mapped with exif Copyright
		'2#025' => 'Keywords',
		'2#101' => 'CountryDest', // country (shown)
		'2#095' => 'ProvinceOrStateDest', // state/province (shown)
		'2#090' => 'CityDest', // city (shown)
		'2#092' => 'SublocationDest', // sublocation (shown)
		'2#005' => 'ObjectName', // object name/title
		'2#040' => 'SpecialInstructions',
		'2#105' => 'Headline',
		// "Identifies the provider of the objectdata,
		// not necessarily the owner/creator".
		'2#110' => 'Credit',
		// "Identifies the original owner of the intellectual content of the
		// objectdata. This could be an agency, a member of an agency or
		// an individual."
		'2#115' => 'Source',
		'2#007' => 'EditStatus', // lead, correction, etc
		'2#015' => 'iimCategory', // deprecated. max 3 letters in theory, often more
		'2#020' => 'iimSupplementalCategory', // deprecated
		'2#010' => 'Urgency', // 1-8. 1 most, 5 normal, 8 low priority
		// "Identifies objectdata that recurs often and predictably...
		// Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things)
		// ex: TUR (for turkey), XUN (for UN), XSP (outer space)
		// See wikipedia article on iso 3166 and appendix D of iim std.
		'2#026' => 'LocationDestCode',
		// Content location name. Full printable name of location of photo.
		'2#027' => 'LocationDest',
		// Object cycle. a for morning (am), p for evening, b for both
		'2#075' => 'ObjectCycle',
		// Country/Primary location code.
		// "Indicates the code of the country/primary location where the
		// intellectual property of the objectdata was created"
		// unclear how this differs from 2#026
		'2#100' => 'CountryCodeDest',
		// Original transmission ref.
		// "A code representing the location of original transmission
		// according to practices of the provider."
		'2#103' => 'OriginalTransmissionRef',
		'2#118' => 'Contact',
		// Writer/Editor
		// "Identification of the name of the person involved in the writing,
		// editing or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		'2#135' => 'LanguageCode',
	);

	/**
	 * Date tags. Maps the date tag => array( matching time tag, result key ).
	 *
	 * Incomplete dates are not accepted even though they are valid according
	 * to the spec. Should potentially store the timezone as well.
	 *
	 * @var array
	 */
	private static $dateTags = array(
		// Date created (not date digitized). Maps to exif DateTimeOriginal
		'2#055' => array( '2#060', 'DateTimeOriginal' ),
		// Date converted to digital representation. Maps to exif DateTimeDigitized
		'2#062' => array( '2#063', 'DateTimeDigitized' ),
		// Date released
		'2#030' => array( '2#035', 'DateTimeReleased' ),
		// Date expires
		'2#037' => array( '2#038', 'DateTimeExpires' ),
	);

	/**
	 * ISO 2022 escape sequences (\x1b = ESC) that may appear in tag 1:90,
	 * mapped to the charset name handed to iconv(). This just goes through
	 * the charsets mentioned in appendix C of the iim standard.
	 *
	 * @var array
	 */
	private static $charsetEscapes = array(
		"\x1b%G" => 'UTF-8',
		// Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but british
		"\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finnish encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norwegian encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => 'ISO646-IT',
		"\x1b(L" => 'ISO646-PT',
		"\x1b(Z" => 'ISO646-ES',
		"\x1b([" => 'GREEK7-OLD',
		"\x1b(K" => 'ISO646-DE',
		"\x1b(N" => 'ISO_5427', // cyrillic
		"\x1b(`" => 'NS_4551-1', // iso646-NO
		"\x1b(f" => 'NF_Z_62-010', // iso646-FR
		"\x1b(g" => 'PT2', // iso646-PT2
		"\x1b(h" => 'ES2',
		"\x1b(i" => 'MSZ_7795.3', // iso646-HU
		"\x1b(w" => 'CSA_Z243.4-1985-1',
		"\x1b(x" => 'CSA_Z243.4-1985-2',
		"\x1b\$(B" => 'JIS_C6226-1983',
		"\x1b\$B" => 'JIS_C6226-1983',
		"\x1b&@\x1b\$B" => 'JIS_C6226-1983',
		"\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
		// iso-8859-x: at least for the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		"\x1b-I" => 'CSN_369103',
	);

	/**
	 * Parse a raw IPTC block into an array of metadata.
	 *
	 * The block is handed to iptcparse(); every recognised tag is converted
	 * to UTF-8 and stored under an Exif-style key. Unsupported tags are
	 * logged and skipped.
	 *
	 * @param string $rawData Raw IPTC data, as accepted by iptcparse()
	 * @return array Result key => value. Most values are arrays of strings;
	 *   dates are strings, 'iimVersion' is an int, and 'IntellectualGenre'
	 *   and 'SubjectNewsCode' are single strings. Empty when the data cannot
	 *   be parsed or declares an unknown charset.
	 */
	public static function parse( $rawData ) {
		$parsed = iptcparse( $rawData );
		$data = array();
		if ( !is_array( $parsed ) ) {
			return $data;
		}

		$c = '';
		// charset info contained in tag 1:90.
		if ( isset( $parsed['1#090'][0] ) ) {
			$c = self::getCharset( $parsed['1#090'][0] );
			if ( $c === false ) {
				// Unknown charset. Refuse to parse.
				// Note: there is a difference between an unknown charset
				// and no charset specified.
				return array();
			}
			unset( $parsed['1#090'] );
		}

		foreach ( $parsed as $tag => $val ) {
			// 2:00 holds a binary number, not text: trim() would strip bytes
			// such as 0x00, 0x09 or 0x0A and wrongly drop those versions, so
			// it is exempt from the whitespace check.
			if ( $tag !== '2#000' && isset( $val[0] ) && trim( $val[0] ) === '' ) {
				wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
				continue;
			}

			if ( isset( self::$simpleTags[$tag] ) ) {
				$data[self::$simpleTags[$tag]] = self::convIPTC( $val, $c );
				continue;
			}

			if ( isset( self::$dateTags[$tag] ) ) {
				list( $timeTag, $field ) = self::$dateTags[$tag];
				$time = isset( $parsed[$timeTag] ) ? $parsed[$timeTag] : array();
				$timestamp = self::timeHelper( $val, $time, $c );
				if ( $timestamp ) {
					$data[$field] = $timestamp;
				}
				continue;
			}

			switch ( $tag ) {
				case '2#080':
					/* Byline. Mapped with exif Artist.
					 * Merged with byline title (2:85) like how exif does it
					 * with "Title, person". Not sure if this is the best
					 * approach since we no longer have the two fields
					 * separate. Each byline title entry corresponds to a
					 * specific byline. */
					$bylines = self::convIPTC( $val, $c );
					$titles = isset( $parsed['2#085'] )
						? self::convIPTC( $parsed['2#085'], $c )
						: array();

					$titleCount = count( $titles );
					for ( $i = 0; $i < $titleCount; $i++ ) {
						// theoretically this should always be set
						// but doesn't hurt to be careful.
						if ( isset( $bylines[$i] ) ) {
							$bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
						}
					}
					$data['Artist'] = $bylines;
					break;

				case '2#065':
					/* Originating program.
					 * Combined with program version (2:70) if present. */
					$software = self::convIPTC( $val, $c );

					if ( count( $software ) !== 1 ) {
						// According to the iim standard this cannot have
						// multiple values, so if there is more than one,
						// something weird is happening, and we skip it.
						wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
						break;
					}

					if ( isset( $parsed['2#070'] ) ) {
						// a version is set for the software.
						$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
						$data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
					} else {
						$data['Software'] = $software;
					}
					break;

				case '2#000':
					/* iim version. Unlike other tags, this is a 2-byte
					 * big-endian binary number. Technically this is required
					 * if there is iptc data, but in practice it isn't always
					 * there. The length check is just to be paranoid. */
					if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
						$data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
					}
					break;

				case '2#004':
					/* IntellectualGenre.
					 * The first 4 characters are an id code that we're not
					 * really interested in.
					 *
					 * This prop is weird, since it's allowed to have multiple
					 * values in iim 4.1, but not in the XMP stuff. We just
					 * extract the first value. */
					$con = self::convIPTC( $val, $c );
					if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
						wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
						break;
					}
					$data['IntellectualGenre'] = substr( $con[0], 4 );
					break;

				case '2#012':
					/* Subject News code - this is a compound field. At the
					 * moment we only extract the subject news code, which is
					 * an 8 digit (ascii) number describing the subject matter
					 * of the content. */
					$codes = self::convIPTC( $val, $c );
					foreach ( $codes as $ic ) {
						$fields = explode( ':', $ic, 3 );

						if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
							wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
							// stops at the first invalid code; later codes
							// are not examined.
							break;
						}
						// each valid code overwrites the previous one, so
						// only the last is kept.
						$data['SubjectNewsCode'] = $fields[1];
					}
					break;

				// purposely does not do 2:125, 2:130, 2:131,
				// 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
				// 2:200, 2:201, 2:202
				// or the audio stuff (2:150 to 2:154)

				case '2#070':
				case '2#060':
				case '2#063':
				case '2#085':
				case '2#038':
				case '2#035':
					// ignore. Handled together with their companion tag.
					break;

				default:
					wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
					break;
			}
		}

		return $data;
	}

	/**
	 * Combine an IIM date value and an optional time value into a timestamp.
	 *
	 * @param array $date Values of the date tag (CCYYMMDD); must hold exactly one
	 * @param array $time Values of the time tag (HHMMSS±HHMM); when it does
	 *   not hold exactly one value only the date part is returned
	 * @param string $c Charset of the values, as accepted by convIPTC()
	 * @return string|null TS_EXIF timestamp, its first 10 characters when no
	 *   time was given, or null when the input is unusable
	 */
	private static function timeHelper( $date, $time, $c ) {
		if ( count( $date ) !== 1 ) {
			// the standard says this should always be 1; just double checking.
			return null;
		}
		list( $date ) = self::convIPTC( $date, $c );

		if ( count( $time ) === 1 ) {
			list( $time ) = self::convIPTC( $time, $c );
			$dateOnly = false;
		} else {
			$time = '000000+0000'; // placeholder
			$dateOnly = true;
		}

		// NOTE(review): both patterns are unanchored, so they only require
		// the digits to appear somewhere in the value, while the substr()
		// offsets below assume they start at position 0.
		$valid = preg_match( '/\d\d\d\d\d\d[-+]\d\d\d\d/', $time )
			&& preg_match( '/\d\d\d\d\d\d\d\d/', $date )
			&& substr( $date, 0, 4 ) !== '0000'
			&& substr( $date, 4, 2 ) !== '00'
			&& substr( $date, 6, 2 ) !== '00';
		if ( !$valid ) {
			// Something wrong.
			// Note, this rejects some valid dates according to the iptc spec,
			// for example: the date 00000400 means the photo was taken in
			// April, but the year and day are unknown. We don't process these
			// types of incomplete dates atm.
			wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

			return null;
		}

		$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
		if ( $unixTS === false ) {
			wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

			return null;
		}

		// Offset in seconds, taken from the HHMM after the sign.
		$tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
			+ ( intval( substr( $time, 9, 2 ) ) * 60 );

		if ( substr( $time, 6, 1 ) === '-' ) {
			$tz = -$tz;
		}

		// NOTE(review): the signed offset is added to the parsed time, so
		// 120000+0100 becomes 13:00. Converting local time to UTC would
		// subtract it instead - confirm the intended direction before
		// changing, since stored metadata depends on the current result.
		$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
		if ( $finalTimestamp === false ) {
			wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

			return null;
		}

		if ( $dateOnly ) {
			// return the date only
			return substr( $finalTimestamp, 0, 10 );
		}

		return $finalTimestamp;
	}

	/**
	 * Convert a value, or every element of an array of values, to UTF-8.
	 *
	 * @param string|array $data Value(s) to convert
	 * @param string $charset Source charset; empty to guess (see convIPTCHelper)
	 * @return string|array Converted value(s), same shape as $data
	 */
	private static function convIPTC( $data, $charset ) {
		if ( is_array( $data ) ) {
			foreach ( $data as &$val ) {
				$val = self::convIPTCHelper( $val, $charset );
			}
			// drop the reference so later writes to $val cannot alter $data
			unset( $val );
		} else {
			$data = self::convIPTCHelper( $data, $charset );
		}

		return $data;
	}

	/**
	 * Convert a single string to UTF-8.
	 *
	 * With a charset the string goes through iconv() and is trimmed; a failed
	 * conversion yields an empty string. Without one the string is returned
	 * as is when the UTF-8 check leaves it unchanged, and is otherwise
	 * converted as Windows-1252.
	 *
	 * @param string $data Value to convert
	 * @param string $charset Source charset, or empty when none was declared
	 * @return string
	 */
	private static function convIPTCHelper( $data, $charset ) {
		if ( $charset ) {
			wfSuppressWarnings();
			$data = iconv( $charset, "UTF-8//IGNORE", $data );
			wfRestoreWarnings();
			if ( $data === false ) {
				$data = "";
				wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
			}
		} else {
			// Treat as utf-8 if it is valid utf-8, otherwise pretend it's
			// windows-1252. Most of the time if there is no 1:90 tag, it is
			// either ascii, latin1, or utf-8.
			$oldData = $data;
			UtfNormal::quickIsNFCVerify( $data ); // make $data valid utf-8
			if ( $data === $oldData ) {
				// Validation didn't change $data.
				// NOTE(review): this path is not trimmed, unlike the
				// charset path below - confirm whether that is intended.
				return $data;
			}

			return self::convIPTCHelper( $oldData, 'Windows-1252' );
		}

		return trim( $data );
	}

	/**
	 * Work out the charset declared by the contents of tag 1:90.
	 *
	 * According to the iim standard, the charset is defined by tag 1:90,
	 * which holds iso 2022 escape sequences. The standard seems to encourage
	 * that all necessary escape sequences are in the 1:90 tag, but says it
	 * doesn't have to be.
	 *
	 * This is in need of more testing probably, and is definitely not
	 * complete. However, reading the docs of some other iptc software, it
	 * appears that most iptc software only recognizes utf-8. If the 1:90 tag
	 * is not present, content is usually ascii or iso-8859-1 (and sometimes
	 * utf-8), but no guarantee.
	 *
	 * Only the exact sequences listed in self::$charsetEscapes are matched,
	 * so this won't work for other combinations of escape sequences in the
	 * 1:90 tag, or if something is put in the G2 or G3 charsets, etc. It
	 * will only reliably recognize utf-8.
	 *
	 * @param string $tag Contents of tag 1:90
	 * @return string|bool Charset name for iconv(), or false when unknown
	 */
	public static function getCharset( $tag ) {
		if ( is_string( $tag ) && isset( self::$charsetEscapes[$tag] ) ) {
			return self::$charsetEscapes[$tag];
		}

		wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

		// at this point just give up and refuse to parse iptc?
		return false;
	}
}