[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/includes/media/ -> DjVuImage.php (source)

   1  <?php
   2  /**
   3   * DjVu image handler.
   4   *
   5   * Copyright © 2006 Brion Vibber <[email protected]>
   6   * https://www.mediawiki.org/
   7   *
   8   * This program is free software; you can redistribute it and/or modify
   9   * it under the terms of the GNU General Public License as published by
  10   * the Free Software Foundation; either version 2 of the License, or
  11   * (at your option) any later version.
  12   *
  13   * This program is distributed in the hope that it will be useful,
  14   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16   * GNU General Public License for more details.
  17   *
  18   * You should have received a copy of the GNU General Public License along
  19   * with this program; if not, write to the Free Software Foundation, Inc.,
  20   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21   * http://www.gnu.org/copyleft/gpl.html
  22   *
  23   * @file
  24   * @ingroup Media
  25   */
  26  
  27  /**
  28   * Support for detecting/validating DjVu image files and getting
  29   * some basic file metadata (resolution etc)
  30   *
  31   * File format docs are available in source package for DjVuLibre:
  32   * http://djvulibre.djvuzone.org/
  33   *
  34   * @ingroup Media
  35   */
  36  class DjVuImage {
    /**
     * Memory limit handed to wfShellExec() (as the 'memory' entry of its
     * limits array) when retrieveMetaData() runs the djvutxt command.
     *
     * NOTE(review): the unit is whatever wfShellExec() expects for 'memory',
     * presumably kilobytes; confirm against wfShellExec().
     *
     * @const DJVUTXT_MEMORY_LIMIT Memory limit for the DjVu description software
     */
    const DJVUTXT_MEMORY_LIMIT = 300000;
  41  
  42      /**
  43       * Constructor
  44       *
  45       * @param string $filename The DjVu file name.
  46       */
  47  	function __construct( $filename ) {
  48          $this->mFilename = $filename;
  49      }
  50  
  51      /**
  52       * Check if the given file is indeed a valid DjVu image file
  53       * @return bool
  54       */
  55  	public function isValid() {
  56          $info = $this->getInfo();
  57  
  58          return $info !== false;
  59      }
  60  
  61      /**
  62       * Return data in the style of getimagesize()
  63       * @return array|bool Array or false on failure
  64       */
  65  	public function getImageSize() {
  66          $data = $this->getInfo();
  67  
  68          if ( $data !== false ) {
  69              $width = $data['width'];
  70              $height = $data['height'];
  71  
  72              return array( $width, $height, 'DjVu',
  73                  "width=\"$width\" height=\"$height\"" );
  74          }
  75  
  76          return false;
  77      }
  78  
  79      // ---------
  80  
  81      /**
  82       * For debugging; dump the IFF chunk structure
  83       */
  84  	function dump() {
  85          $file = fopen( $this->mFilename, 'rb' );
  86          $header = fread( $file, 12 );
  87          // @todo FIXME: Would be good to replace this extract() call with
  88          // something that explicitly initializes local variables.
  89          extract( unpack( 'a4magic/a4chunk/NchunkLength', $header ) );
  90          /** @var string $chunk
  91           * @var string $chunkLength */
  92          echo "$chunk $chunkLength\n";
  93          $this->dumpForm( $file, $chunkLength, 1 );
  94          fclose( $file );
  95      }
  96  
  97  	private function dumpForm( $file, $length, $indent ) {
  98          $start = ftell( $file );
  99          $secondary = fread( $file, 4 );
 100          echo str_repeat( ' ', $indent * 4 ) . "($secondary)\n";
 101          while ( ftell( $file ) - $start < $length ) {
 102              $chunkHeader = fread( $file, 8 );
 103              if ( $chunkHeader == '' ) {
 104                  break;
 105              }
 106              // @todo FIXME: Would be good to replace this extract() call with
 107              // something that explicitly initializes local variables.
 108              extract( unpack( 'a4chunk/NchunkLength', $chunkHeader ) );
 109              /** @var string $chunk
 110               * @var string $chunkLength */
 111              echo str_repeat( ' ', $indent * 4 ) . "$chunk $chunkLength\n";
 112  
 113              if ( $chunk == 'FORM' ) {
 114                  $this->dumpForm( $file, $chunkLength, $indent + 1 );
 115              } else {
 116                  fseek( $file, $chunkLength, SEEK_CUR );
 117                  if ( $chunkLength & 1 == 1 ) {
 118                      // Padding byte between chunks
 119                      fseek( $file, 1, SEEK_CUR );
 120                  }
 121              }
 122          }
 123      }
 124  
 125  	function getInfo() {
 126          wfSuppressWarnings();
 127          $file = fopen( $this->mFilename, 'rb' );
 128          wfRestoreWarnings();
 129          if ( $file === false ) {
 130              wfDebug( __METHOD__ . ": missing or failed file read\n" );
 131  
 132              return false;
 133          }
 134  
 135          $header = fread( $file, 16 );
 136          $info = false;
 137  
 138          if ( strlen( $header ) < 16 ) {
 139              wfDebug( __METHOD__ . ": too short file header\n" );
 140          } else {
 141              // @todo FIXME: Would be good to replace this extract() call with
 142              // something that explicitly initializes local variables.
 143              extract( unpack( 'a4magic/a4form/NformLength/a4subtype', $header ) );
 144  
 145              /** @var string $magic
 146               * @var string $subtype
 147               * @var string $formLength
 148               * @var string $formType */
 149              if ( $magic != 'AT&T' ) {
 150                  wfDebug( __METHOD__ . ": not a DjVu file\n" );
 151              } elseif ( $subtype == 'DJVU' ) {
 152                  // Single-page document
 153                  $info = $this->getPageInfo( $file, $formLength );
 154              } elseif ( $subtype == 'DJVM' ) {
 155                  // Multi-page document
 156                  $info = $this->getMultiPageInfo( $file, $formLength );
 157              } else {
 158                  wfDebug( __METHOD__ . ": unrecognized DJVU file type '$formType'\n" );
 159              }
 160          }
 161          fclose( $file );
 162  
 163          return $info;
 164      }
 165  
 166  	private function readChunk( $file ) {
 167          $header = fread( $file, 8 );
 168          if ( strlen( $header ) < 8 ) {
 169              return array( false, 0 );
 170          } else {
 171              // @todo FIXME: Would be good to replace this extract() call with
 172              // something that explicitly initializes local variables.
 173              extract( unpack( 'a4chunk/Nlength', $header ) );
 174  
 175              /** @var string $chunk
 176               * @var string $length */
 177              return array( $chunk, $length );
 178          }
 179      }
 180  
 181  	private function skipChunk( $file, $chunkLength ) {
 182          fseek( $file, $chunkLength, SEEK_CUR );
 183  
 184          if ( $chunkLength & 0x01 == 1 && !feof( $file ) ) {
 185              // padding byte
 186              fseek( $file, 1, SEEK_CUR );
 187          }
 188      }
 189  
 190  	private function getMultiPageInfo( $file, $formLength ) {
 191          // For now, we'll just look for the first page in the file
 192          // and report its information, hoping others are the same size.
 193          $start = ftell( $file );
 194          do {
 195              list( $chunk, $length ) = $this->readChunk( $file );
 196              if ( !$chunk ) {
 197                  break;
 198              }
 199  
 200              if ( $chunk == 'FORM' ) {
 201                  $subtype = fread( $file, 4 );
 202                  if ( $subtype == 'DJVU' ) {
 203                      wfDebug( __METHOD__ . ": found first subpage\n" );
 204  
 205                      return $this->getPageInfo( $file, $length );
 206                  }
 207                  $this->skipChunk( $file, $length - 4 );
 208              } else {
 209                  wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
 210                  $this->skipChunk( $file, $length );
 211              }
 212          } while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );
 213  
 214          wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );
 215  
 216          return false;
 217      }
 218  
 219  	private function getPageInfo( $file, $formLength ) {
 220          list( $chunk, $length ) = $this->readChunk( $file );
 221          if ( $chunk != 'INFO' ) {
 222              wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );
 223  
 224              return false;
 225          }
 226  
 227          if ( $length < 9 ) {
 228              wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );
 229  
 230              return false;
 231          }
 232          $data = fread( $file, $length );
 233          if ( strlen( $data ) < $length ) {
 234              wfDebug( __METHOD__ . ": INFO chunk cut off\n" );
 235  
 236              return false;
 237          }
 238  
 239          // @todo FIXME: Would be good to replace this extract() call with
 240          // something that explicitly initializes local variables.
 241          extract( unpack(
 242              'nwidth/' .
 243              'nheight/' .
 244              'Cminor/' .
 245              'Cmajor/' .
 246              'vresolution/' .
 247              'Cgamma', $data ) );
 248  
 249          # Newer files have rotation info in byte 10, but we don't use it yet.
 250  
 251          /** @var string $width
 252           * @var string $height
 253           * @var string $major
 254           * @var string $minor
 255           * @var string $resolution
 256           * @var string $length
 257           * @var string $gamma */
 258          return array(
 259              'width' => $width,
 260              'height' => $height,
 261              'version' => "$major.$minor",
 262              'resolution' => $resolution,
 263              'gamma' => $gamma / 10.0 );
 264      }
 265  
    /**
     * Return an XML string describing the DjVu image
     *
     * The page structure comes from the first configured tool: $wgDjvuDump
     * (whose output is converted by convertDumpToXML()) or else $wgDjvuToXML.
     * If $wgDjvuTxt is configured and the command exits with status 0, its
     * per-page text is appended as a <DjVuTxt> section; "<mw-djvu>" is
     * inserted in front of the first "<DjVuXML>" and "</mw-djvu>" is appended.
     *
     * NOTE(review): when the structure step yielded null or false, there is no
     * "<DjVuXML>" to match, so the text section is returned followed by a
     * closing "</mw-djvu>" that has no opening tag. Confirm that callers
     * cope with this.
     *
     * @return string|bool|null XML text; without an appended text layer the
     *  result is null when neither structure tool is configured and false
     *  when convertDumpToXML() failed
     */
    function retrieveMetaData() {
        global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;
        wfProfileIn( __METHOD__ );

        if ( isset( $wgDjvuDump ) ) {
            # djvudump is faster as of version 3.5
            # http://sourceforge.net/tracker/index.php?func=detail&aid=1704049&group_id=32953&atid=406583
            wfProfileIn( 'djvudump' );
            $cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
            // The exit status of this command is not inspected; an empty
            // dump makes convertDumpToXML() return false.
            $dump = wfShellExec( $cmd );
            $xml = $this->convertDumpToXML( $dump );
            wfProfileOut( 'djvudump' );
        } elseif ( isset( $wgDjvuToXML ) ) {
            wfProfileIn( 'djvutoxml' );
            $cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
                wfEscapeShellArg( $this->mFilename );
            $xml = wfShellExec( $cmd );
            wfProfileOut( 'djvutoxml' );
        } else {
            $xml = null;
        }
        # Text layer
        if ( isset( $wgDjvuTxt ) ) {
            wfProfileIn( 'djvutxt' );
            $cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename );
            wfDebug( __METHOD__ . ": $cmd\n" );
            $retval = '';
            // $retval receives the exit status; the memory limit is the
            // class constant DJVUTXT_MEMORY_LIMIT.
            $txt = wfShellExec( $cmd, $retval, array(), array( 'memory' => self::DJVUTXT_MEMORY_LIMIT ) );
            wfProfileOut( 'djvutxt' );
            if ( $retval == 0 ) {
                # Strip some control characters
                $txt = preg_replace( "/[\013\035\037]/", "", $txt );
                // Extended-mode (/x) pattern, so its whitespace and '#'
                // comments are ignored by PCRE. Capture group 1 is the quoted
                // page text, or the empty group of the "()" alternative.
                $reg = <<<EOR
                    /\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
                    ((?>    # Text to match is composed of atoms of either:
                      \\\\. # - any escaped character
                      |     # - any character different from " and \
                      [^"\\\\]+
                    )*?)
                    "\s*\)
                    | # Or page can be empty ; in this case, djvutxt dumps ()
                    \(\s*()\)/sx
EOR;
                // Each matched page is rewritten to a <PAGE value="..." />
                // element by pageTextCallback().
                $txt = preg_replace_callback( $reg, array( $this, 'pageTextCallback' ), $txt );
                $txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
                // Limit 1: only the first <DjVuXML> receives the wrapper's opening tag.
                $xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
                $xml = $xml . $txt . '</mw-djvu>';
            }
        }
        wfProfileOut( __METHOD__ );

        return $xml;
    }
 323  
 324  	function pageTextCallback( $matches ) {
 325          # Get rid of invalid UTF-8, strip control characters
 326          $val = htmlspecialchars( UtfNormal::cleanUp( stripcslashes( $matches[1] ) ) );
 327          $val = str_replace( array( "\n", '�' ), array( '&#10;', '' ), $val );
 328          return '<PAGE value="' . $val . '" />';
 329      }
 330  
 331      /**
 332       * Hack to temporarily work around djvutoxml bug
 333       * @param string $dump
 334       * @return string
 335       */
 336  	function convertDumpToXML( $dump ) {
 337          if ( strval( $dump ) == '' ) {
 338              return false;
 339          }
 340  
 341          $xml = <<<EOT
 342  <?xml version="1.0" ?>
 343  <!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
 344  <DjVuXML>
 345  <HEAD></HEAD>
 346  <BODY>
 347  EOT;
 348  
 349          $dump = str_replace( "\r", '', $dump );
 350          $line = strtok( $dump, "\n" );
 351          $m = false;
 352          $good = false;
 353          if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
 354              # Single-page
 355              if ( $this->parseFormDjvu( $line, $xml ) ) {
 356                  $good = true;
 357              } else {
 358                  return false;
 359              }
 360          } elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
 361              # Multi-page
 362              $parentLevel = strlen( $m[1] );
 363              # Find DIRM
 364              $line = strtok( "\n" );
 365              while ( $line !== false ) {
 366                  $childLevel = strspn( $line, ' ' );
 367                  if ( $childLevel <= $parentLevel ) {
 368                      # End of chunk
 369                      break;
 370                  }
 371  
 372                  if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
 373                      wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );
 374  
 375                      return false;
 376                  }
 377                  if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
 378                      # Found page
 379                      if ( $this->parseFormDjvu( $line, $xml ) ) {
 380                          $good = true;
 381                      } else {
 382                          return false;
 383                      }
 384                  }
 385                  $line = strtok( "\n" );
 386              }
 387          }
 388          if ( !$good ) {
 389              return false;
 390          }
 391  
 392          $xml .= "</BODY>\n</DjVuXML>\n";
 393  
 394          return $xml;
 395      }
 396  
 397  	function parseFormDjvu( $line, &$xml ) {
 398          $parentLevel = strspn( $line, ' ' );
 399          $line = strtok( "\n" );
 400  
 401          # Find INFO
 402          while ( $line !== false ) {
 403              $childLevel = strspn( $line, ' ' );
 404              if ( $childLevel <= $parentLevel ) {
 405                  # End of chunk
 406                  break;
 407              }
 408  
 409              if ( preg_match(
 410                  '/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
 411                  $line,
 412                  $m
 413              ) ) {
 414                  $xml .= Xml::tags(
 415                      'OBJECT',
 416                      array(
 417                          #'data' => '',
 418                          #'type' => 'image/x.djvu',
 419                          'height' => $m[2],
 420                          'width' => $m[1],
 421                          #'usemap' => '',
 422                      ),
 423                      "\n" .
 424                          Xml::element( 'PARAM', array( 'name' => 'DPI', 'value' => $m[3] ) ) . "\n" .
 425                          Xml::element( 'PARAM', array( 'name' => 'GAMMA', 'value' => $m[4] ) ) . "\n"
 426                  ) . "\n";
 427  
 428                  return true;
 429              }
 430              $line = strtok( "\n" );
 431          }
 432  
 433          # Not found
 434          return false;
 435      }
 436  }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1