[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
1 <?php 2 /** 3 * Reader for XMP data containing properties relevant to images. 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License along 16 * with this program; if not, write to the Free Software Foundation, Inc., 17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 * http://www.gnu.org/copyleft/gpl.html 19 * 20 * @file 21 * @ingroup Media 22 */ 23 24 /** 25 * Class for reading xmp data containing properties relevant to 26 * images, and spitting out an array that FormatMetadata accepts. 27 * 28 * Note, this is not meant to recognize every possible thing you can 29 * encode in XMP. It should recognize all the properties we want. 30 * For example it doesn't have support for structures with multiple 31 * nesting levels, as none of the properties we're supporting use that 32 * feature. If it comes across properties it doesn't recognize, it should 33 * ignore them. 34 * 35 * The public methods one would call in this class are 36 * - parse( $content ) 37 * Reads in xmp content. 38 * Can potentially be called multiple times with partial data each time. 39 * - parseExtended( $content ) 40 * Reads XMPExtended blocks (jpeg files only). 41 * - getResults 42 * Outputs a results array. 43 * 44 * Note XMP kind of looks like rdf. They are not the same thing - XMP is 45 * encoded as a specific subset of rdf. This class can read XMP. It cannot 46 * read rdf. 
47 * 48 */ 49 class XMPReader { 50 /** @var array XMP item configuration array */ 51 protected $items; 52 53 /** @var array Array to hold the current element (and previous element, and so on) */ 54 private $curItem = array(); 55 56 /** @var bool|string The structure name when processing nested structures. */ 57 private $ancestorStruct = false; 58 59 /** @var bool|string Temporary holder for character data that appears in xmp doc. */ 60 private $charContent = false; 61 62 /** @var array Stores the state the xmpreader is in (see MODE_FOO constants) */ 63 private $mode = array(); 64 65 /** @var array Array to hold results */ 66 private $results = array(); 67 68 /** @var bool If we're doing a seq or bag. */ 69 private $processingArray = false; 70 71 /** @var bool|string Used for lang alts only */ 72 private $itemLang = false; 73 74 /** @var resource A resource handle for the XML parser */ 75 private $xmlParser; 76 77 /** @var bool|string Character set like 'UTF-8' */ 78 private $charset = false; 79 80 /** @var int */ 81 private $extendedXMPOffset = 0; 82 83 /** 84 * These are various mode constants. 85 * they are used to figure out what to do 86 * with an element when its encountered. 87 * 88 * For example, MODE_IGNORE is used when processing 89 * a property we're not interested in. So if a new 90 * element pops up when we're in that mode, we ignore it. 91 */ 92 const MODE_INITIAL = 0; 93 const MODE_IGNORE = 1; 94 const MODE_LI = 2; 95 const MODE_LI_LANG = 3; 96 const MODE_QDESC = 4; 97 98 // The following MODE constants are also used in the 99 // $items array to denote what type of property the item is. 100 const MODE_SIMPLE = 10; 101 const MODE_STRUCT = 11; // structure (associative array) 102 const MODE_SEQ = 12; // ordered list 103 const MODE_BAG = 13; // unordered list 104 const MODE_LANG = 14; 105 const MODE_ALT = 15; // non-language alt. Currently not implemented, and not needed atm. 106 const MODE_BAGSTRUCT = 16; // A BAG of Structs. 
const NS_RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
const NS_XML = 'http://www.w3.org/XML/1998/namespace';

/**
 * Constructor.
 *
 * Loads the XMP item configuration and initializes the XML parser.
 *
 * @throws MWException If the PHP XML parser functions are not available
 */
function __construct() {
    if ( !function_exists( 'xml_parser_create_ns' ) ) {
        // this should already be checked by this point
        throw new MWException( 'XMP support requires XML Parser' );
    }

    $this->items = XMPInfo::getItems();

    $this->resetXMLParser();
}

/**
 * (Re)create the XML parser and register this object's handlers on it.
 *
 * Main use is if a single item has multiple xmp documents describing it.
 * For example in jpeg's with extendedXMP
 */
private function resetXMLParser() {
    if ( $this->xmlParser ) {
        //is this needed?
        xml_parser_free( $this->xmlParser );
    }

    // Namespace-aware parser; element names arrive as "namespace-uri tagname".
    $this->xmlParser = xml_parser_create_ns( 'UTF-8', ' ' );
    xml_parser_set_option( $this->xmlParser, XML_OPTION_CASE_FOLDING, 0 );
    xml_parser_set_option( $this->xmlParser, XML_OPTION_SKIP_WHITE, 1 );

    xml_set_element_handler( $this->xmlParser,
        array( $this, 'startElement' ),
        array( $this, 'endElement' ) );

    xml_set_character_data_handler( $this->xmlParser, array( $this, 'char' ) );
}

/**
 * Destroy the xml parser
 *
 * Not sure if this is actually needed.
 */
function __destruct() {
    // The parser is missing if the constructor threw before creating it.
    if ( $this->xmlParser ) {
        xml_parser_free( $this->xmlParser );
    }
}

/**
 * Get the result array. Do some post-processing before returning
 * the array, and transform any metadata that is special-cased.
 *
 * @return array Array of results as an array of arrays suitable for
 *  FormatMetadata::getFormattedData().
 */
public function getResults() {
    // xmp-special is for metadata that affects how stuff
    // is extracted. For example xmpNote:HasExtendedXMP.

    // It is also used to handle photoshop:AuthorsPosition
    // which is weird and really part of another property,
    // see 2:85 in IPTC. See also pg 21 of IPTC4XMP standard.
    // The location fields also use it.

    $data = $this->results;

    wfRunHooks( 'XMPGetResults', array( &$data ) );

    if ( isset( $data['xmp-special']['AuthorsPosition'] )
        && is_string( $data['xmp-special']['AuthorsPosition'] )
        && isset( $data['xmp-general']['Artist'][0] )
    ) {
        // Note, if there is more than one creator,
        // this only applies to first. This also will
        // only apply to the dc:Creator prop, not the
        // exif:Artist prop.

        $data['xmp-general']['Artist'][0] =
            $data['xmp-special']['AuthorsPosition'] . ', '
            . $data['xmp-general']['Artist'][0];
    }

    // Go through the LocationShown and LocationCreated
    // changing it to the non-hierarchal form used by
    // the other location fields.

    if ( isset( $data['xmp-special']['LocationShown'][0] )
        && is_array( $data['xmp-special']['LocationShown'][0] )
    ) {
        // the is_array is just paranoia. It should always
        // be an array.
        foreach ( $data['xmp-special']['LocationShown'] as $loc ) {
            if ( !is_array( $loc ) ) {
                // To avoid copying over the _type meta-fields.
                continue;
            }
            foreach ( $loc as $field => $val ) {
                $data['xmp-general'][$field . 'Dest'][] = $val;
            }
        }
    }
    if ( isset( $data['xmp-special']['LocationCreated'][0] )
        && is_array( $data['xmp-special']['LocationCreated'][0] )
    ) {
        // the is_array is just paranoia. It should always
        // be an array.
        foreach ( $data['xmp-special']['LocationCreated'] as $loc ) {
            if ( !is_array( $loc ) ) {
                // To avoid copying over the _type meta-fields.
                continue;
            }
            foreach ( $loc as $field => $val ) {
                $data['xmp-general'][$field . 'Created'][] = $val;
            }
        }
    }

    // We don't want to return the special values, since they're
    // special and not info to be stored about the file.
    unset( $data['xmp-special'] );

    // Convert GPSAltitude to negative if below sea level.
    if ( isset( $data['xmp-exif']['GPSAltitudeRef'] )
        && isset( $data['xmp-exif']['GPSAltitude'] )
    ) {
        // Must convert to a real before multiplying by -1
        // XMPValidate guarantees there will always be a '/' in this value.
        list( $nom, $denom ) = explode( '/', $data['xmp-exif']['GPSAltitude'] );
        $data['xmp-exif']['GPSAltitude'] = $nom / $denom;

        if ( $data['xmp-exif']['GPSAltitudeRef'] == '1' ) {
            $data['xmp-exif']['GPSAltitude'] *= -1;
        }
        unset( $data['xmp-exif']['GPSAltitudeRef'] );
    }

    return $data;
}

/**
 * Main function to call to parse XMP. Use getResults to
 * get results.
 *
 * Also catches any errors during processing, writes them to
 * debug log, blanks result array and returns false.
 *
 * @param string $content XMP data
 * @param bool $allOfIt If this is all the data (true) or if its split up (false). Default true
 * @param bool $reset Does xml parser need to be reset. Default false
 * @throws MWException
 * @return bool Success.
 */
public function parse( $content, $allOfIt = true, $reset = false ) {
    if ( $reset ) {
        $this->resetXMLParser();
    }
    try {
        // detect encoding by looking for BOM which is supposed to be in processing instruction.
        // see page 12 of http://www.adobe.com/devnet/xmp/pdfs/XMPSpecificationPart3.pdf
        if ( !$this->charset ) {
            $bom = array();
            if ( preg_match( '/\xEF\xBB\xBF|\xFE\xFF|\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\xFF\xFE/',
                $content, $bom )
            ) {
                switch ( $bom[0] ) {
                    case "\xFE\xFF":
                        $this->charset = 'UTF-16BE';
                        break;
                    case "\xFF\xFE":
                        $this->charset = 'UTF-16LE';
                        break;
                    case "\x00\x00\xFE\xFF":
                        $this->charset = 'UTF-32BE';
                        break;
                    case "\xFF\xFE\x00\x00":
                        $this->charset = 'UTF-32LE';
                        break;
                    case "\xEF\xBB\xBF":
                        $this->charset = 'UTF-8';
                        break;
                    default:
                        //this should be impossible to get to
                        throw new MWException( "Invalid BOM" );
                }
            } else {
                // standard specifically says, if no bom assume utf-8
                $this->charset = 'UTF-8';
            }
        }
        if ( $this->charset !== 'UTF-8' ) {
            //don't convert if already utf-8
            wfSuppressWarnings();
            $content = iconv( $this->charset, 'UTF-8//IGNORE', $content );
            wfRestoreWarnings();
        }

        $ok = xml_parse( $this->xmlParser, $content, $allOfIt );
        if ( !$ok ) {
            $error = xml_error_string( xml_get_error_code( $this->xmlParser ) );
            $where = 'line: ' . xml_get_current_line_number( $this->xmlParser )
                . ' column: ' . xml_get_current_column_number( $this->xmlParser )
                . ' byte offset: ' . xml_get_current_byte_index( $this->xmlParser );

            wfDebugLog( 'XMP', "XMPReader::parse : Error reading XMP content: $error ($where)" );
            $this->results = array(); // blank if error.
            return false;
        }
    } catch ( MWException $e ) {
        wfDebugLog( 'XMP', 'XMP parse error: ' . $e );
        $this->results = array();

        return false;
    }

    return true;
}

/**
 * Entry point for XMPExtended blocks in jpeg files
 *
 * The block layout read here is: a 32 byte GUID, a 4 byte big-endian
 * full length of the whole ExtendedXMP serialization, a 4 byte big-endian
 * offset of this portion within it, then the portion itself.
 *
 * @todo In serious need of testing
 * @see http://www.adobe.ge/devnet/xmp/pdfs/XMPSpecificationPart3.pdf XMP spec part 3 page 20
 * @param string $content XMPExtended block minus the namespace signature
 * @return bool If it succeeded.
 */
public function parseExtended( $content ) {
    // @todo FIXME: This is untested. Hard to find example files
    // or programs that make such files..
    $guid = substr( $content, 0, 32 );
    if ( !isset( $this->results['xmp-special']['HasExtendedXMP'] )
        || $this->results['xmp-special']['HasExtendedXMP'] !== $guid
    ) {
        wfDebugLog( 'XMP', __METHOD__ .
            " Ignoring XMPExtended block due to wrong guid (guid= '$guid')" );

        return false;
    }
    $len = unpack( 'Nlength/Noffset', substr( $content, 32, 8 ) );

    if ( !$len || $len['length'] < 4 || $len['offset'] < 0 || $len['offset'] > $len['length'] ) {
        wfDebugLog( 'XMP', __METHOD__ . ' Error reading extended XMP block, invalid length or offset.' );

        return false;
    }

    // we're not very robust here. we should accept it in the wrong order.
    // To quote the XMP standard:
    // "A JPEG writer should write the ExtendedXMP marker segments in order,
    // immediately following the StandardXMP. However, the JPEG standard
    // does not require preservation of marker segment order. A robust JPEG
    // reader should tolerate the marker segments in any order."
    //
    // otoh the probability that an image will have more than 128k of
    // metadata is rather low... so the probability that it will have
    // > 128k, and be in the wrong order is very low...

    if ( $len['offset'] !== $this->extendedXMPOffset ) {
        wfDebugLog( 'XMP', __METHOD__ . ' Ignoring XMPExtended block due to wrong order. (Offset was '
            . $len['offset'] . ' but expected ' . $this->extendedXMPOffset . ')' );

        return false;
    }

    // Cast because substr() returns false rather than '' on older PHP
    // when nothing follows the 40 byte header.
    $actualContent = (string)substr( $content, 40 );
    $portionLength = strlen( $actualContent );

    // 'length' is the size of the complete serialization, not of this
    // portion, so this portion has to fit between 'offset' and 'length'.
    if ( $portionLength === 0 || $len['offset'] + $portionLength > $len['length'] ) {
        wfDebugLog( 'XMP', __METHOD__
            . ' Error reading extended XMP block, portion does not fit the declared length.' );

        return false;
    }

    if ( $len['offset'] === 0 ) {
        // if we're starting the extended block, we've probably already
        // done the XMPStandard block, so reset.
        $this->resetXMLParser();
    }

    // Advance by what was actually received; the next portion must start here.
    $this->extendedXMPOffset += $portionLength;

    // This is the final portion once the whole declared length has been seen.
    $atEnd = ( $this->extendedXMPOffset === $len['length'] );

    wfDebugLog( 'XMP', __METHOD__ . ' Parsing a XMPExtended block' );

    return $this->parse( $actualContent, $atEnd );
}

/**
 * Character data handler
 * Called whenever character data is found in the xmp document.
 *
 * does nothing if we're in MODE_IGNORE or if the data is whitespace
 * throws an error if we're not in MODE_SIMPLE or MODE_QDESC (as we're
 * not allowed to have character data in the other modes).
 *
 * As an example, this happens when we encounter XMP like:
 * <exif:DigitalZoomRatio>0/10</exif:DigitalZoomRatio>
 * and are processing the 0/10 bit.
 *
 * @param XMLParser $parser XMLParser reference to the xml parser
 * @param string $data Character data
 * @throws MWException On invalid data
 */
function char( $parser, $data ) {
    $data = trim( $data );
    if ( $data === '' ) {
        return;
    }

    if ( !isset( $this->mode[0] ) ) {
        throw new MWException( 'Unexpected character data before first rdf:Description element' );
    }

    if ( $this->mode[0] === self::MODE_IGNORE ) {
        return;
    }

    if ( $this->mode[0] !== self::MODE_SIMPLE
        && $this->mode[0] !== self::MODE_QDESC
    ) {
        throw new MWException( 'character data where not expected. (mode ' . $this->mode[0] . ')' );
    }

    // to check, how does this handle w.s.
    if ( $this->charContent === false ) {
        $this->charContent = $data;
    } else {
        $this->charContent .= $data;
    }
}

/**
 * When we hit a closing element in MODE_IGNORE
 * Check to see if this is the element we started to ignore,
 * in which case we get out of MODE_IGNORE
 *
 * @param string $elm Namespace of element followed by a space and then tag name of element.
 */
private function endElementModeIgnore( $elm ) {
    if ( $this->curItem[0] === $elm ) {
        array_shift( $this->curItem );
        array_shift( $this->mode );
    }
}

/**
 * Hit a closing element when in MODE_SIMPLE.
 * This generally means that we finished processing a
 * property value, and now have to save the result to the
 * results array
 *
 * For example, when processing:
 * <exif:DigitalZoomRatio>0/10</exif:DigitalZoomRatio>
 * this deals with when we hit </exif:DigitalZoomRatio>.
 *
 * Or it could be if we hit the end element of a property
 * of a compound data structure (like a member of an array).
 *
 * @param string $elm Namespace, space, and tag name.
 */
private function endElementModeSimple( $elm ) {
    if ( $this->charContent !== false ) {
        if ( $this->processingArray ) {
            // if we're processing an array, use the original element
            // name instead of rdf:li.
            list( $ns, $tag ) = explode( ' ', $this->curItem[0], 2 );
        } else {
            list( $ns, $tag ) = explode( ' ', $elm, 2 );
        }
        $this->saveValue( $ns, $tag, $this->charContent );

        $this->charContent = false; // reset
    }
    array_shift( $this->curItem );
    array_shift( $this->mode );
}

/**
 * Hit a closing element in MODE_STRUCT, MODE_SEQ, MODE_BAG
 * generally means we've finished processing a nested structure.
 * resets some internal variables to indicate that.
 *
 * Note this means we hit the closing element not the "</rdf:Seq>".
*
 * @par For example, when processing:
 * @code{,xml}
 * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li>
 *   </rdf:Seq> </exif:ISOSpeedRatings>
 * @endcode
 *
 * This method is called when we hit the "</exif:ISOSpeedRatings>" tag.
 *
 * @param string $elm Namespace . space . tag name.
 * @throws MWException
 */
private function endElementNested( $elm ) {
    /* cur item must be the same as $elm, unless if in MODE_STRUCT
       in which case it could also be rdf:Description */
    if ( $this->curItem[0] !== $elm
        && !( $elm === self::NS_RDF . ' Description'
            && $this->mode[0] === self::MODE_STRUCT )
    ) {
        throw new MWException( "nesting mismatch. got a </$elm> but expected a </" .
            $this->curItem[0] . '>' );
    }

    // Validate structures.
    list( $ns, $tag ) = explode( ' ', $elm, 2 );
    if ( isset( $this->items[$ns][$tag]['validate'] ) ) {

        $info =& $this->items[$ns][$tag];
        $finalName = isset( $info['map_name'] )
            ? $info['map_name'] : $tag;

        // A bare method name refers to a static method of XMPValidate.
        $validate = is_array( $info['validate'] ) ? $info['validate']
            : array( 'XMPValidate', $info['validate'] );

        if ( !isset( $this->results['xmp-' . $info['map_group']][$finalName] ) ) {
            // This can happen if all the members of the struct failed validation.
            wfDebugLog( 'XMP', __METHOD__ . " <$ns:$tag> has no valid members." );
        } elseif ( is_callable( $validate ) ) {
            $val =& $this->results['xmp-' . $info['map_group']][$finalName];
            call_user_func_array( $validate, array( $info, &$val, false ) );
            if ( is_null( $val ) ) {
                // the idea being the validation function will unset the variable if
                // its invalid.
                wfDebugLog( 'XMP', __METHOD__ . " <$ns:$tag> failed validation." );
                unset( $this->results['xmp-' . $info['map_group']][$finalName] );
            }
        } else {
            wfDebugLog( 'XMP', __METHOD__ . " Validation function for $finalName ("
                . $validate[0] . '::' . $validate[1] . '()) is not callable.' );
        }
    }

    array_shift( $this->curItem );
    array_shift( $this->mode );
    $this->ancestorStruct = false;
    $this->processingArray = false;
    $this->itemLang = false;
}

/**
 * Hit a closing element in MODE_LI (either rdf:Seq, or rdf:Bag )
 * Add information about what type of element this is.
 *
 * Note we still have to hit the outer "</property>"
 *
 * @par For example, when processing:
 * @code{,xml}
 * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li>
 *   </rdf:Seq> </exif:ISOSpeedRatings>
 * @endcode
 *
 * This method is called when we hit the "</rdf:Seq>".
 * (For comparison, we call endElementModeSimple when we
 * hit the "</rdf:li>")
 *
 * @param string $elm Namespace . ' ' . element name
 * @throws MWException
 */
private function endElementModeLi( $elm ) {
    list( $ns, $tag ) = explode( ' ', $this->curItem[0], 2 );
    $info = $this->items[$ns][$tag];
    $finalName = isset( $info['map_name'] )
        ? $info['map_name'] : $tag;

    array_shift( $this->mode );

    if ( !isset( $this->results['xmp-' . $info['map_group']][$finalName] ) ) {
        wfDebugLog( 'XMP', __METHOD__ . " Empty compund element $finalName." );

        return;
    }

    if ( $elm === self::NS_RDF . ' Seq' ) {
        $this->results['xmp-' . $info['map_group']][$finalName]['_type'] = 'ol';
    } elseif ( $elm === self::NS_RDF . ' Bag' ) {
        $this->results['xmp-' . $info['map_group']][$finalName]['_type'] = 'ul';
    } elseif ( $elm === self::NS_RDF . ' Alt' ) {
        // extra if needed as you could theoretically have a non-language alt.
        if ( $info['mode'] === self::MODE_LANG ) {
            $this->results['xmp-' . $info['map_group']][$finalName]['_type'] = 'lang';
        }
    } else {
        throw new MWException( __METHOD__ . " expected </rdf:seq> or </rdf:bag> but instead got $elm." );
    }
}

/**
 * End element while in MODE_QDESC
 * mostly when ending an element when we have a simple value
 * that has qualifiers.
 *
 * Qualifiers aren't all that common, and we don't do anything
 * with them.
 *
 * @param string $elm Namespace and element
 */
private function endElementModeQDesc( $elm ) {
    if ( $elm === self::NS_RDF . ' value' ) {
        list( $ns, $tag ) = explode( ' ', $this->curItem[0], 2 );
        $this->saveValue( $ns, $tag, $this->charContent );

        return;
    } else {
        array_shift( $this->mode );
        array_shift( $this->curItem );
    }
}

/**
 * Handler for hitting a closing element.
 *
 * generally just calls a helper function depending on what
 * mode we're in.
 *
 * Ignores the outer wrapping elements that are optional in
 * xmp and have no meaning.
 *
 * @param XMLParser $parser
 * @param string $elm Namespace . ' ' . element name
 * @throws MWException
 */
function endElement( $parser, $elm ) {
    if ( $elm === ( self::NS_RDF . ' RDF' )
        || $elm === 'adobe:ns:meta/ xmpmeta'
        || $elm === 'adobe:ns:meta/ xapmeta'
    ) {
        // ignore these.
        return;
    }

    if ( $elm === self::NS_RDF . ' type' ) {
        // these aren't really supported properly yet.
        // However, it appears they almost never used.
        wfDebugLog( 'XMP', __METHOD__ . ' encountered <rdf:type>' );
    }

    if ( strpos( $elm, ' ' ) === false ) {
        // This probably shouldn't happen.
        // However, there is a bug in an adobe product
        // that forgets the namespace on some things.
        // (Luckily they are unimportant things).
        wfDebugLog( 'XMP', __METHOD__ . " Encountered </$elm> which has no namespace. Skipping." );

        return;
    }

    // Check the mode stack itself: $this->mode[0] is a scalar mode
    // constant (or unset), which is not something count() accepts.
    if ( count( $this->mode ) === 0 ) {
        // This should never ever happen and means
        // there is a pretty major bug in this class.
        throw new MWException( 'Encountered end element with no mode' );
    }

    if ( count( $this->curItem ) == 0 && $this->mode[0] !== self::MODE_INITIAL ) {
        // just to be paranoid. Should always have a curItem, except for initially
        // (aka during MODE_INITAL).
        throw new MWException( "Hit end element </$elm> but no curItem" );
    }

    switch ( $this->mode[0] ) {
        case self::MODE_IGNORE:
            $this->endElementModeIgnore( $elm );
            break;
        case self::MODE_SIMPLE:
            $this->endElementModeSimple( $elm );
            break;
        case self::MODE_STRUCT:
        case self::MODE_SEQ:
        case self::MODE_BAG:
        case self::MODE_LANG:
        case self::MODE_BAGSTRUCT:
            $this->endElementNested( $elm );
            break;
        case self::MODE_INITIAL:
            if ( $elm === self::NS_RDF . ' Description' ) {
                array_shift( $this->mode );
            } else {
                throw new MWException( 'Element ended unexpectedly while in MODE_INITIAL' );
            }
            break;
        case self::MODE_LI:
        case self::MODE_LI_LANG:
            $this->endElementModeLi( $elm );
            break;
        case self::MODE_QDESC:
            $this->endElementModeQDesc( $elm );
            break;
        default:
            wfDebugLog( 'XMP', __METHOD__ . " no mode (elm = $elm)" );
            break;
    }
}

/**
 * Hit an opening element while in MODE_IGNORE
 *
 * XMP is extensible, so ignore any tag we don't understand.
 *
 * Mostly ignores, unless we encounter the element that we are ignoring.
 * in which case we add it to the item stack, so we can ignore things
 * that are nested, correctly.
 *
 * @param string $elm Namespace . ' ' . tag name
 */
private function startElementModeIgnore( $elm ) {
    if ( $elm === $this->curItem[0] ) {
        array_unshift( $this->curItem, $elm );
        array_unshift( $this->mode, self::MODE_IGNORE );
    }
}

/**
 * Start element in MODE_BAG (unordered array)
 * this should always be <rdf:Bag>
 *
 * @param string $elm Namespace . ' ' .
tag
 * @throws MWException If we have an element that's not <rdf:Bag>
 */
private function startElementModeBag( $elm ) {
    if ( $elm === self::NS_RDF . ' Bag' ) {
        array_unshift( $this->mode, self::MODE_LI );
    } else {
        throw new MWException( "Expected <rdf:Bag> but got $elm." );
    }
}

/**
 * Start element in MODE_SEQ (ordered array)
 * this should always be <rdf:Seq>
 *
 * @param string $elm Namespace . ' ' . tag
 * @throws MWException If we have an element that's not <rdf:Seq>
 */
private function startElementModeSeq( $elm ) {
    if ( $elm === self::NS_RDF . ' Seq' ) {
        array_unshift( $this->mode, self::MODE_LI );
    } elseif ( $elm === self::NS_RDF . ' Bag' ) {
        # bug 27105
        wfDebugLog( 'XMP', __METHOD__ . ' Expected an rdf:Seq, but got an rdf:Bag. Pretending'
            . ' it is a Seq, since some buggy software is known to screw this up.' );
        array_unshift( $this->mode, self::MODE_LI );
    } else {
        throw new MWException( "Expected <rdf:Seq> but got $elm." );
    }
}

/**
 * Start element in MODE_LANG (language alternative)
 * this should always be <rdf:Alt>
 *
 * This tag tends to be used for metadata like describe this
 * picture, which can be translated into multiple languages.
 *
 * XMP supports non-linguistic alternative selections,
 * which are really only used for thumbnails, which
 * we don't care about.
 *
 * @param string $elm Namespace . ' ' . tag
 * @throws MWException If we have an element that's not <rdf:Alt>
 */
private function startElementModeLang( $elm ) {
    if ( $elm === self::NS_RDF . ' Alt' ) {
        array_unshift( $this->mode, self::MODE_LI_LANG );
    } else {
        // Name the element that is actually required here (rdf:Alt).
        throw new MWException( "Expected <rdf:Alt> but got $elm." );
    }
}

/**
 * Handle an opening element when in MODE_SIMPLE
 *
 * This should not happen often. This is for if a simple element
 * already opened has a child element.
Could happen for a
 * qualified element.
 *
 * For example:
 * <exif:DigitalZoomRatio><rdf:Description><rdf:value>0/10</rdf:value>
 *   <foo:someQualifier>Bar</foo:someQualifier> </rdf:Description>
 * </exif:DigitalZoomRatio>
 *
 * This method is called when processing the <rdf:Description> element
 *
 * @param string $elm Namespace and tag names separated by space.
 * @param array $attribs Attributes of the element.
 * @throws MWException If an <rdf:value> element shows up directly inside the simple element
 */
private function startElementModeSimple( $elm, $attribs ) {
    $rdfValue = self::NS_RDF . ' value';

    if ( $elm === $rdfValue ) {
        // A bare rdf:value is only legal inside an rdf:Description.
        throw new MWException( __METHOD__ . ' Encountered <rdf:value> where it was unexpected.' );
    }

    if ( $elm !== self::NS_RDF . ' Description' ) {
        // Anything else is unknown to us (a qualifier maybe): skip it and its children.
        wfDebugLog( 'XMP', __METHOD__ .
            " Encountered element <$elm> where only expecting character data as value of " .
            $this->curItem[0] );
        array_unshift( $this->mode, self::MODE_IGNORE );
        array_unshift( $this->curItem, $elm );

        return;
    }

    // rdf:Description: the value carries qualifiers. The current item is
    // pushed a second time so it is still on top while in MODE_QDESC.
    array_unshift( $this->mode, self::MODE_QDESC );
    array_unshift( $this->curItem, $this->curItem[0] );

    if ( isset( $attribs[$rdfValue] ) ) {
        // The value may be given as an attribute rather than a child element.
        list( $ns, $tag ) = explode( ' ', $this->curItem[0], 2 );
        $this->saveValue( $ns, $tag, $attribs[$rdfValue] );
    }
}

/**
 * Start an element when in MODE_QDESC.
 * This generally happens when a simple element has an inner
 * rdf:Description to hold qualifier elements.
 *
 * For example in:
 * <exif:DigitalZoomRatio><rdf:Description><rdf:value>0/10</rdf:value>
 *   <foo:someQualifier>Bar</foo:someQualifier> </rdf:Description>
 * </exif:DigitalZoomRatio>
 * Called when processing the <rdf:value> or <foo:someQualifier>.
 *
 * @param string $elm Namespace and tag name separated by a space.
*
 */
private function startElementModeQDesc( $elm ) {
    if ( $elm === self::NS_RDF . ' value' ) {
        return; // do nothing
    } else {
        // otherwise its a qualifier, which we ignore
        array_unshift( $this->mode, self::MODE_IGNORE );
        array_unshift( $this->curItem, $elm );
    }
}

/**
 * Starting an element when in MODE_INITIAL
 * This usually happens when we hit an element inside
 * the outer rdf:Description
 *
 * This is generally where most properties start.
 *
 * @param string $ns Namespace
 * @param string $tag Tag name (without namespace prefix)
 * @param array $attribs Array of attributes
 * @throws MWException
 */
private function startElementModeInitial( $ns, $tag, $attribs ) {
    if ( $ns !== self::NS_RDF ) {

        if ( isset( $this->items[$ns][$tag] ) ) {
            if ( isset( $this->items[$ns][$tag]['structPart'] ) ) {
                // If this element is supposed to appear only as
                // a child of a structure, but appears here (not as
                // a child of a struct), then something weird is
                // happening, so ignore this element and its children.

                wfDebugLog( 'XMP', "Encountered <$ns:$tag> outside"
                    . " of its expected parent. Ignoring." );

                array_unshift( $this->mode, self::MODE_IGNORE );
                array_unshift( $this->curItem, $ns . ' ' . $tag );

                return;
            }
            $mode = $this->items[$ns][$tag]['mode'];
            array_unshift( $this->mode, $mode );
            array_unshift( $this->curItem, $ns . ' ' . $tag );
            if ( $mode === self::MODE_STRUCT ) {
                $this->ancestorStruct = isset( $this->items[$ns][$tag]['map_name'] )
                    ? $this->items[$ns][$tag]['map_name'] : $tag;
            }
            if ( $this->charContent !== false ) {
                // Something weird.
                // Should not happen in valid XMP.
                throw new MWException( 'tag nested in non-whitespace characters.' );
            }
        } else {
            // This element is not on our list of allowed elements so ignore.
            wfDebugLog( 'XMP', __METHOD__ . " Ignoring unrecognized element <$ns:$tag>." );
            array_unshift( $this->mode, self::MODE_IGNORE );
            array_unshift( $this->curItem, $ns . ' ' . $tag );

            return;
        }
    }
    // process attributes
    $this->doAttribs( $attribs );
}

/**
 * Hit an opening element when in a Struct (MODE_STRUCT)
 * This is generally for fields of a compound property.
 *
 * Example of a struct (abbreviated; flash has more properties):
 *
 * <exif:Flash> <rdf:Description> <exif:Fired>True</exif:Fired>
 *  <exif:Mode>1</exif:Mode></rdf:Description></exif:Flash>
 *
 * or:
 *
 * <exif:Flash rdf:parseType='Resource'> <exif:Fired>True</exif:Fired>
 *  <exif:Mode>1</exif:Mode></exif:Flash>
 *
 * @param string $ns Namespace
 * @param string $tag Tag name (no ns)
 * @param array $attribs Array of attribs w/ values.
 * @throws MWException
 */
private function startElementModeStruct( $ns, $tag, $attribs ) {
    if ( $ns !== self::NS_RDF ) {

        if ( isset( $this->items[$ns][$tag] ) ) {
            if ( isset( $this->items[$ns][$this->ancestorStruct]['children'] )
                && !isset( $this->items[$ns][$this->ancestorStruct]['children'][$tag] )
            ) {
                // This assumes that we don't have inter-namespace nesting
                // which we don't in all the properties we're interested in.
                throw new MWException( " <$tag> appeared nested in <" . $this->ancestorStruct
                    . "> where it is not allowed." );
            }
            array_unshift( $this->mode, $this->items[$ns][$tag]['mode'] );
            array_unshift( $this->curItem, $ns . ' ' . $tag );
            if ( $this->charContent !== false ) {
                // Something weird.
                // Should not happen in valid XMP.
                throw new MWException( "tag <$tag> nested in non-whitespace characters (" .
                    $this->charContent . ")." );
            }
        } else {
            // Unknown member: ignore it. The full "namespace tag" name must
            // go on the item stack, because endElementModeIgnore() compares
            // against it to know when to leave MODE_IGNORE.
            array_unshift( $this->mode, self::MODE_IGNORE );
            array_unshift( $this->curItem, $ns . ' ' . $tag );

            return;
        }
    }

    if ( $ns === self::NS_RDF && $tag === 'Description' ) {
        $this->doAttribs( $attribs );
        array_unshift( $this->mode, self::MODE_STRUCT );
        array_unshift( $this->curItem, $this->curItem[0] );
    }
}

/**
 * opening element in MODE_LI
 * process elements of arrays.
 *
 * Example:
 * <exif:ISOSpeedRatings> <rdf:Seq> <rdf:li>64</rdf:li>
 *   </rdf:Seq> </exif:ISOSpeedRatings>
 * This method is called when we hit the <rdf:li> element.
 *
 * @param string $elm Namespace . ' ' . tagname
 * @param array $attribs Attributes. (needed for BAGSTRUCTS)
 * @throws MWException If gets a tag other than <rdf:li>
 */
private function startElementModeLi( $elm, $attribs ) {
    if ( ( $elm ) !== self::NS_RDF . ' li' ) {
        throw new MWException( "<rdf:li> expected but got $elm." );
    }

    if ( !isset( $this->mode[1] ) ) {
        // This should never ever ever happen. Checking for it
        // to be paranoid.
        throw new MWException( 'In mode Li, but no 2xPrevious mode!' );
    }

    if ( $this->mode[1] === self::MODE_BAGSTRUCT ) {
        // This list item contains a compound (STRUCT) value.
        array_unshift( $this->mode, self::MODE_STRUCT );
        array_unshift( $this->curItem, $elm );
        $this->processingArray = true;

        if ( !isset( $this->curItem[1] ) ) {
            // be paranoid.
            throw new MWException( 'Can not find parent of BAGSTRUCT.' );
        }
        // Limit of 2 matches every other "namespace tag" split in this class.
        list( $curNS, $curTag ) = explode( ' ', $this->curItem[1], 2 );
        $this->ancestorStruct = isset( $this->items[$curNS][$curTag]['map_name'] )
            ? $this->items[$curNS][$curTag]['map_name'] : $curTag;

        $this->doAttribs( $attribs );
    } else {
        // Normal BAG or SEQ containing simple values.
        array_unshift( $this->mode, self::MODE_SIMPLE );
        // need to add curItem[0] on again since one is for the specific item
        // and one is for the entire group.
        array_unshift( $this->curItem, $this->curItem[0] );
        $this->processingArray = true;
    }
}

/**
 * Opening element in MODE_LI_LANG.
 * process elements of language alternatives
 *
 * Example:
 * <dc:title> <rdf:Alt> <rdf:li xml:lang="x-default">My house
 *  </rdf:li> </rdf:Alt> </dc:title>
 *
 * This method is called when we hit the <rdf:li> element.
 *
 * @param string $elm Namespace . ' ' . tag
 * @param array $attribs Array of elements (most importantly xml:lang)
 * @throws MWException If gets a tag other than <rdf:li> or if no xml:lang
 */
private function startElementModeLiLang( $elm, $attribs ) {
    if ( $elm !== self::NS_RDF . ' li' ) {
        throw new MWException( __METHOD__ . " <rdf:li> expected but got $elm." );
    }
    if ( !isset( $attribs[self::NS_XML . ' lang'] )
        || !preg_match( '/^[-A-Za-z0-9]{2,}$/D', $attribs[self::NS_XML . ' lang'] )
    ) {
        throw new MWException( __METHOD__
            . " <rdf:li> did not contain, or has invalid xml:lang attribute in lang alternative" );
    }

    // Lang is case-insensitive.
    $this->itemLang = strtolower( $attribs[self::NS_XML . ' lang'] );

    // need to add curItem[0] on again since one is for the specific item
    // and one is for the entire group.
    array_unshift( $this->curItem, $this->curItem[0] );
    array_unshift( $this->mode, self::MODE_SIMPLE );
    $this->processingArray = true;
}

/**
 * Hits an opening element.
 * Generally just calls a helper based on what MODE we're in.
1042 * Also does some initial set up for the wrapper element 1043 * 1044 * @param XMLParser $parser 1045 * @param string $elm Namespace "<space>" element 1046 * @param array $attribs Attribute name => value 1047 * @throws MWException 1048 */ 1049 function startElement( $parser, $elm, $attribs ) { 1050 1051 if ( $elm === self::NS_RDF . ' RDF' 1052 || $elm === 'adobe:ns:meta/ xmpmeta' 1053 || $elm === 'adobe:ns:meta/ xapmeta' 1054 ) { 1055 /* ignore. */ 1056 return; 1057 } elseif ( $elm === self::NS_RDF . ' Description' ) { 1058 if ( count( $this->mode ) === 0 ) { 1059 // outer rdf:desc 1060 array_unshift( $this->mode, self::MODE_INITIAL ); 1061 } 1062 } elseif ( $elm === self::NS_RDF . ' type' ) { 1063 // This doesn't support rdf:type properly. 1064 // In practise I have yet to see a file that 1065 // uses this element, however it is mentioned 1066 // on page 25 of part 1 of the xmp standard. 1067 // 1068 // also it seems as if exiv2 and exiftool do not support 1069 // this either (That or I misunderstand the standard) 1070 wfDebugLog( 'XMP', __METHOD__ . ' Encountered <rdf:type> which isn\'t currently supported' ); 1071 } 1072 1073 if ( strpos( $elm, ' ' ) === false ) { 1074 // This probably shouldn't happen. 1075 wfDebugLog( 'XMP', __METHOD__ . " Encountered <$elm> which has no namespace. Skipping." ); 1076 1077 return; 1078 } 1079 1080 list( $ns, $tag ) = explode( ' ', $elm, 2 ); 1081 1082 if ( count( $this->mode ) === 0 ) { 1083 // This should not happen. 1084 throw new MWException( 'Error extracting XMP, ' 1085 . 
"encountered <$elm> with no mode" ); 1086 } 1087 1088 switch ( $this->mode[0] ) { 1089 case self::MODE_IGNORE: 1090 $this->startElementModeIgnore( $elm ); 1091 break; 1092 case self::MODE_SIMPLE: 1093 $this->startElementModeSimple( $elm, $attribs ); 1094 break; 1095 case self::MODE_INITIAL: 1096 $this->startElementModeInitial( $ns, $tag, $attribs ); 1097 break; 1098 case self::MODE_STRUCT: 1099 $this->startElementModeStruct( $ns, $tag, $attribs ); 1100 break; 1101 case self::MODE_BAG: 1102 case self::MODE_BAGSTRUCT: 1103 $this->startElementModeBag( $elm ); 1104 break; 1105 case self::MODE_SEQ: 1106 $this->startElementModeSeq( $elm ); 1107 break; 1108 case self::MODE_LANG: 1109 $this->startElementModeLang( $elm ); 1110 break; 1111 case self::MODE_LI_LANG: 1112 $this->startElementModeLiLang( $elm, $attribs ); 1113 break; 1114 case self::MODE_LI: 1115 $this->startElementModeLi( $elm, $attribs ); 1116 break; 1117 case self::MODE_QDESC: 1118 $this->startElementModeQDesc( $elm ); 1119 break; 1120 default: 1121 throw new MWException( 'StartElement in unknown mode: ' . $this->mode[0] ); 1122 } 1123 } 1124 1125 /** 1126 * Process attributes. 1127 * Simple values can be stored as either a tag or attribute 1128 * 1129 * Often the initial "<rdf:Description>" tag just has all the simple 1130 * properties as attributes. 1131 * 1132 * @codingStandardsIgnoreStart Long line that cannot be broken 1133 * @par Example: 1134 * @code 1135 * <rdf:Description rdf:about="" xmlns:exif="http://ns.adobe.com/exif/1.0/" exif:DigitalZoomRatio="0/10"> 1136 * @endcode 1137 * @codingStandardsIgnoreEnd 1138 * 1139 * @param array $attribs Array attribute=>value 1140 * @throws MWException 1141 */ 1142 private function doAttribs( $attribs ) { 1143 // first check for rdf:parseType attribute, as that can change 1144 // how the attributes are interperted. 1145 1146 if ( isset( $attribs[self::NS_RDF . ' parseType'] ) 1147 && $attribs[self::NS_RDF . 
' parseType'] === 'Resource' 1148 && $this->mode[0] === self::MODE_SIMPLE 1149 ) { 1150 // this is equivalent to having an inner rdf:Description 1151 $this->mode[0] = self::MODE_QDESC; 1152 } 1153 foreach ( $attribs as $name => $val ) { 1154 if ( strpos( $name, ' ' ) === false ) { 1155 // This shouldn't happen, but so far some old software forgets namespace 1156 // on rdf:about. 1157 wfDebugLog( 'XMP', __METHOD__ . ' Encountered non-namespaced attribute: ' 1158 . " $name=\"$val\". Skipping. " ); 1159 continue; 1160 } 1161 list( $ns, $tag ) = explode( ' ', $name, 2 ); 1162 if ( $ns === self::NS_RDF ) { 1163 if ( $tag === 'value' || $tag === 'resource' ) { 1164 // resource is for url. 1165 // value attribute is a weird way of just putting the contents. 1166 $this->char( $this->xmlParser, $val ); 1167 } 1168 } elseif ( isset( $this->items[$ns][$tag] ) ) { 1169 if ( $this->mode[0] === self::MODE_SIMPLE ) { 1170 throw new MWException( __METHOD__ 1171 . " $ns:$tag found as attribute where not allowed" ); 1172 } 1173 $this->saveValue( $ns, $tag, $val ); 1174 } else { 1175 wfDebugLog( 'XMP', __METHOD__ . " Ignoring unrecognized element <$ns:$tag>." ); 1176 } 1177 } 1178 } 1179 1180 /** 1181 * Given an extracted value, save it to results array 1182 * 1183 * note also uses $this->ancestorStruct and 1184 * $this->processingArray to determine what name to 1185 * save the value under. (in addition to $tag). 1186 * 1187 * @param string $ns Namespace of tag this is for 1188 * @param string $tag Tag name 1189 * @param string $val Value to save 1190 */ 1191 private function saveValue( $ns, $tag, $val ) { 1192 1193 $info =& $this->items[$ns][$tag]; 1194 $finalName = isset( $info['map_name'] ) 1195 ? $info['map_name'] : $tag; 1196 if ( isset( $info['validate'] ) ) { 1197 $validate = is_array( $info['validate'] ) ? 
$info['validate'] 1198 : array( 'XMPValidate', $info['validate'] ); 1199 1200 if ( is_callable( $validate ) ) { 1201 call_user_func_array( $validate, array( $info, &$val, true ) ); 1202 // the reasoning behind using &$val instead of using the return value 1203 // is to be consistent between here and validating structures. 1204 if ( is_null( $val ) ) { 1205 wfDebugLog( 'XMP', __METHOD__ . " <$ns:$tag> failed validation." ); 1206 1207 return; 1208 } 1209 } else { 1210 wfDebugLog( 'XMP', __METHOD__ . " Validation function for $finalName (" 1211 . $validate[0] . '::' . $validate[1] . '()) is not callable.' ); 1212 } 1213 } 1214 1215 if ( $this->ancestorStruct && $this->processingArray ) { 1216 // Aka both an array and a struct. ( self::MODE_BAGSTRUCT ) 1217 $this->results['xmp-' . $info['map_group']][$this->ancestorStruct][][$finalName] = $val; 1218 } elseif ( $this->ancestorStruct ) { 1219 $this->results['xmp-' . $info['map_group']][$this->ancestorStruct][$finalName] = $val; 1220 } elseif ( $this->processingArray ) { 1221 if ( $this->itemLang === false ) { 1222 // normal array 1223 $this->results['xmp-' . $info['map_group']][$finalName][] = $val; 1224 } else { 1225 // lang array. 1226 $this->results['xmp-' . $info['map_group']][$finalName][$this->itemLang] = $val; 1227 } 1228 } else { 1229 $this->results['xmp-' . $info['map_group']][$finalName] = $val; 1230 } 1231 } 1232 }
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |