MediaWiki
REL1_24
|
<?php

/**
 * Support for detecting/validating DjVu image files and reading their
 * dimensions and metadata.
 *
 * The binary parsing methods walk the IFF-style chunk structure of the file
 * directly; retrieveMetaData() shells out to the external DjVuLibre tools
 * configured through $wgDjvuDump, $wgDjvuToXML and $wgDjvuTxt.
 */
class DjVuImage {
	/**
	 * Memory limit handed to wfShellExec() (as the 'memory' limit) when
	 * running djvutxt.
	 * NOTE(review): presumably expressed in KB, per wfShellExec -- confirm.
	 */
	const DJVUTXT_MEMORY_LIMIT = 300000;

	/**
	 * @var string Path of the file being inspected. Kept public because it
	 *   used to be created as a dynamic (and therefore public) property.
	 */
	public $mFilename;

	/**
	 * @param string $filename The DjVu file name.
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check whether the file header could be parsed as a DjVu document.
	 *
	 * @return bool True when getInfo() yields page information
	 */
	public function isValid() {
		$info = $this->getInfo();

		return $info !== false;
	}

	/**
	 * Return data in the style of getimagesize().
	 *
	 * @return array|bool Array( width, height, 'DjVu', HTML attribute
	 *   string ), or false when the file could not be parsed
	 */
	public function getImageSize() {
		$data = $this->getInfo();

		if ( $data !== false ) {
			$width = $data['width'];
			$height = $data['height'];

			return array( $width, $height, 'DjVu',
				"width=\"$width\" height=\"$height\"" );
		}

		return false;
	}

	// ---------

	/**
	 * For debugging; echo a dump of the DjVu chunk structure to output.
	 *
	 * Nothing is printed when the file cannot be opened or is too short to
	 * contain the 12 byte top-level header.
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			// fopen() has already raised its own warning.
			return;
		}

		$header = fread( $file, 12 );
		if ( strlen( $header ) === 12 ) {
			$parts = unpack( 'a4magic/a4chunk/NchunkLength', $header );
			$chunk = $parts['chunk'];
			$chunkLength = $parts['chunkLength'];

			echo "$chunk $chunkLength\n";
			$this->dumpForm( $file, $chunkLength, 1 );
		}
		fclose( $file );
	}

	/**
	 * Echo the contents of one FORM chunk, recursing into nested FORMs.
	 *
	 * @param resource $file Open file handle positioned at the FORM's
	 *   secondary identifier
	 * @param int $length Declared length of the FORM chunk
	 * @param int $indent Nesting depth; each level indents by four spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$padding = str_repeat( ' ', $indent * 4 );
		echo $padding . "($secondary)\n";

		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( strlen( $chunkHeader ) < 8 ) {
				// End of file, or a truncated header that unpack() cannot
				// decode.
				break;
			}
			$parts = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $parts['chunk'];
			$chunkLength = $parts['chunkLength'];

			echo $padding . "$chunk $chunkLength\n";

			if ( $chunk == 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				$this->skipChunk( $file, $chunkLength );
			}
		}
	}

	/**
	 * Read the file header and return information about the first page.
	 *
	 * @return array|bool Associative array with the keys 'width', 'height',
	 *   'version', 'resolution' and 'gamma', or false on failure
	 */
	public function getInfo() {
		wfSuppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		wfRestoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );

			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header\n" );
		} else {
			$parts = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
			$magic = $parts['magic'];
			$formLength = $parts['formLength'];
			$subtype = $parts['subtype'];

			if ( $magic != 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file\n" );
			} elseif ( $subtype == 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file, $formLength );
			} elseif ( $subtype == 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $formLength );
			} else {
				// This used to interpolate an undefined $formType variable.
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '$subtype'\n" );
			}
		}
		fclose( $file );

		return $info;
	}

	/**
	 * Read an 8 byte chunk header (4 byte identifier, 32-bit big-endian
	 * length) from the current file position.
	 *
	 * @param resource $file Open file handle
	 * @return array Array( chunk id, length ), or array( false, 0 ) when
	 *   fewer than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( strlen( $header ) < 8 ) {
			return array( false, 0 );
		}

		$parts = unpack( 'a4chunk/Nlength', $header );

		return array( $parts['chunk'], $parts['length'] );
	}

	/**
	 * Advance the file position past a chunk body of the given length,
	 * including the padding byte that follows odd-length chunks.
	 *
	 * @param resource $file Open file handle
	 * @param int $chunkLength Number of body bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		// Parenthesised on purpose: "==" binds tighter than "&" in PHP.
		if ( ( $chunkLength & 0x01 ) === 1 && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Locate the first FORM:DJVU page of a multi-page document and return
	 * its information.
	 *
	 * @param resource $file Open file handle positioned after the header
	 * @param int $formLength Declared length of the enclosing FORM chunk
	 * @return array|bool Page information, or false if no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			list( $chunk, $length ) = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk == 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype == 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage\n" );

					return $this->getPageInfo( $file, $length );
				}
				// The 4 byte subtype has already been consumed.
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );

		return false;
	}

	/**
	 * Parse the INFO chunk expected at the start of a FORM:DJVU page.
	 *
	 * @param resource $file Open file handle positioned at the INFO chunk
	 * @param int $formLength Declared length of the page FORM (unused)
	 * @return array|bool Associative array with the keys 'width', 'height',
	 *   'version', 'resolution' and 'gamma', or false on failure
	 */
	private function getPageInfo( $file, $formLength ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( $chunk != 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );

			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );

			return false;
		}
		$data = fread( $file, $length );
		if ( strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off\n" );

			return false;
		}

		// Width and height are big-endian 16-bit values, the resolution is
		// little-endian; together the fields cover the first 9 bytes.
		$fields = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );

		# Newer files have rotation info in byte 10, but we don't use it yet.

		return array(
			'width' => $fields['width'],
			'height' => $fields['height'],
			'version' => "{$fields['major']}.{$fields['minor']}",
			'resolution' => $fields['resolution'],
			'gamma' => $fields['gamma'] / 10.0 );
	}

	/**
	 * Return an XML string describing the DjVu image, built from the output
	 * of the configured external tools.
	 *
	 * @return string|bool|null XML metadata; null when neither $wgDjvuDump
	 *   nor $wgDjvuToXML is set, false when the djvudump output could not
	 *   be converted (both only when no text layer was appended)
	 */
	public function retrieveMetaData() {
		global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;
		wfProfileIn( __METHOD__ );

		if ( isset( $wgDjvuDump ) ) {
			# djvudump is faster as of version 3.5
			# http://sourceforge.net/tracker/index.php?func=detail&aid=1704049&group_id=32953&atid=406583
			wfProfileIn( 'djvudump' );
			$cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$xml = $this->convertDumpToXML( $dump );
			wfProfileOut( 'djvudump' );
		} elseif ( isset( $wgDjvuToXML ) ) {
			wfProfileIn( 'djvutoxml' );
			$cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
				wfEscapeShellArg( $this->mFilename );
			$xml = wfShellExec( $cmd );
			wfProfileOut( 'djvutoxml' );
		} else {
			$xml = null;
		}

		# Text layer
		if ( isset( $wgDjvuTxt ) ) {
			wfProfileIn( 'djvutxt' );
			$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd\n" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval, array(), array( 'memory' => self::DJVUTXT_MEMORY_LIMIT ) );
			wfProfileOut( 'djvutxt' );
			if ( $retval == 0 ) {
				# Strip some control characters
				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
				$reg = <<<EOR
					/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
					((?>    # Text to match is composed of atoms of either:
					  \\\\. # - any escaped character
					  |     # - any character different from " and \
					  [^"\\\\]+
					)*?)
					"\s*\)
					| # Or page can be empty ; in this case, djvutxt dumps ()
					\(\s*()\)/sx
EOR;
				$txt = preg_replace_callback( $reg, array( $this, 'pageTextCallback' ), $txt );
				$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
				// NOTE(review): $xml may be null or false here when the
				// structure step above produced nothing; the text layer is
				// still appended in that case -- confirm callers cope.
				$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
				$xml = $xml . $txt . '</mw-djvu>';
			}
		}
		wfProfileOut( __METHOD__ );

		return $xml;
	}

	/**
	 * preg_replace_callback() handler turning one page of djvutxt output
	 * into a PAGE element.
	 *
	 * @param array $matches Regex matches; index 1 holds the page text
	 * @return string The PAGE element
	 */
	public function pageTextCallback( $matches ) {
		# Get rid of invalid UTF-8, strip control characters
		$val = htmlspecialchars( UtfNormal::cleanUp( stripcslashes( $matches[1] ) ) );
		$val = str_replace( array( "\n", '�' ), array( ' ', '' ), $val );

		return '<PAGE value="' . $val . '" />';
	}

	/**
	 * Hack to temporarily work around djvutoxml bug: build the XML document
	 * from the textual output of djvudump instead.
	 *
	 * Tokenises $dump with strtok(); parseFormDjvu() continues the same
	 * tokenisation, so the two must be called in this order.
	 *
	 * @param string $dump Output of djvudump
	 * @return string|bool The XML document, or false when no page could be
	 *   parsed or the document is an indirect multi-page file
	 */
	public function convertDumpToXML( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

		$dump = str_replace( "\r", '', $dump );
		$line = strtok( $dump, "\n" );
		$m = false;
		$good = false;
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			if ( $this->parseFormDjvu( $line, $xml ) ) {
				$good = true;
			} else {
				return false;
			}
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			# Multi-page
			$parentLevel = strlen( $m[1] );
			# Find DIRM
			$line = strtok( "\n" );
			while ( $line !== false ) {
				$childLevel = strspn( $line, ' ' );
				if ( $childLevel <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );

					return false;
				}
				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					# Found page
					if ( $this->parseFormDjvu( $line, $xml ) ) {
						$good = true;
					} else {
						return false;
					}
				}
				$line = strtok( "\n" );
			}
		}
		if ( !$good ) {
			return false;
		}

		$xml .= "</BODY>\n</DjVuXML>\n";

		return $xml;
	}

	/**
	 * Scan the children of a FORM:DJVU line in the djvudump output for the
	 * INFO line and append the matching OBJECT element to $xml.
	 *
	 * Continues the strtok() tokenisation started by convertDumpToXML().
	 *
	 * @param string $line The FORM:DJVU line; its indentation marks the
	 *   parent level
	 * @param string &$xml XML document being built; appended to on success
	 * @return bool True if an INFO line was found, false otherwise
	 */
	public function parseFormDjvu( $line, &$xml ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );

		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match(
				'/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
				$line,
				$m
			) ) {
				$xml .= Xml::tags(
					'OBJECT',
					array(
						'height' => $m[2],
						'width' => $m[1],
					),
					"\n" .
					Xml::element( 'PARAM', array( 'name' => 'DPI', 'value' => $m[3] ) ) . "\n" .
					Xml::element( 'PARAM', array( 'name' => 'GAMMA', 'value' => $m[4] ) ) . "\n"
				) . "\n";

				return true;
			}
			$line = strtok( "\n" );
		}

		# Not found
		return false;
	}
}