MediaWiki
REL1_22
|
<?php
/**
 * Reads basic information out of a DjVu file.
 *
 * The binary parser (getInfo() and helpers) walks the IFF-style chunk
 * structure directly: a 4-byte 'AT&T' magic, then chunks made of a 4-byte
 * identifier, a 4-byte big-endian length and the payload, with a padding
 * byte after odd-length payloads. retrieveMetaData() instead shells out to
 * the external djvudump / djvutoxml / djvutxt tools configured through
 * $wgDjvuDump, $wgDjvuToXML and $wgDjvuTxt.
 */
class DjVuImage {
	/**
	 * Path of the DjVu file this object inspects.
	 * Declared explicitly (it used to be created dynamically by the
	 * constructor); kept public so external access keeps working.
	 * @var string
	 */
	public $mFilename;

	/**
	 * Value passed as the 'memory' limit to wfShellExec() when running djvutxt.
	 * NOTE(review): unit is whatever wfShellExec() expects - confirm there.
	 */
	const DJVUTXT_MEMORY_LIMIT = 300000;

	/**
	 * @param string $filename Path of the file to inspect. Nothing is opened
	 *   until one of the reading methods is called.
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check whether the file can be parsed as a DjVu document.
	 *
	 * @return bool True when getInfo() yields page information
	 */
	public function isValid() {
		$info = $this->getInfo();
		return $info !== false;
	}

	/**
	 * Return dimensions in a getimagesize()-like array.
	 *
	 * @return array|bool array( width, height, 'DjVu', 'width="W" height="H"' ),
	 *   or false when getInfo() fails
	 */
	public function getImageSize() {
		$data = $this->getInfo();

		if ( $data !== false ) {
			$width = $data['width'];
			$height = $data['height'];

			return array( $width, $height, 'DjVu',
				"width=\"$width\" height=\"$height\"" );
		}
		return false;
	}

	// ---------

	/**
	 * Debugging aid: echo the chunk structure of the file to standard output.
	 */
	public function dump() {
		wfSuppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		wfRestoreWarnings();
		if ( $file === false ) {
			// Previously the failed handle was passed straight on to fread().
			wfDebug( __METHOD__ . ": missing or failed file read\n" );
			return;
		}

		$header = fread( $file, 12 );
		if ( strlen( $header ) < 12 ) {
			// unpack() cannot decode a truncated header
			wfDebug( __METHOD__ . ": too short file header\n" );
			fclose( $file );
			return;
		}

		// Bytes 0-3 are the magic, which dump() does not print.
		$fields = unpack( 'a4magic/a4chunk/NchunkLength', $header );
		$chunk = $fields['chunk'];
		$chunkLength = $fields['chunkLength'];

		echo "$chunk $chunkLength\n";
		$this->dumpForm( $file, $chunkLength, 1 );
		fclose( $file );
	}

	/**
	 * Echo the chunks of one FORM, recursing into nested FORMs.
	 *
	 * @param resource $file Handle positioned just after the FORM's length field
	 * @param int $length Payload length of the FORM
	 * @param int $indent Nesting depth; each level indents by four spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		echo str_repeat( ' ', $indent * 4 ) . "($secondary)\n";
		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( strlen( $chunkHeader ) < 8 ) {
				// End of file, or a truncated header that unpack() could not decode
				break;
			}
			$fields = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $fields['chunk'];
			$chunkLength = $fields['chunkLength'];
			echo str_repeat( ' ', $indent * 4 ) . "$chunk $chunkLength\n";

			if ( $chunk == 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				fseek( $file, $chunkLength, SEEK_CUR );
				// Parenthesised explicitly: '==' binds tighter than '&'.
				if ( ( $chunkLength & 1 ) === 1 ) {
					// Padding byte between chunks
					fseek( $file, 1, SEEK_CUR );
				}
			}
		}
	}

	/**
	 * Parse the file header and return information about the (first) page.
	 *
	 * @return array|bool Array with the keys 'width', 'height', 'version',
	 *   'resolution' and 'gamma' (see getPageInfo()), or false when the file
	 *   cannot be opened or is not a recognised DjVu document
	 */
	public function getInfo() {
		wfSuppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		wfRestoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );
			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header\n" );
		} else {
			$fields = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
			$magic = $fields['magic'];
			$formLength = $fields['formLength'];
			$subtype = $fields['subtype'];

			if ( $magic != 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file\n" );
			} elseif ( $subtype == 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file, $formLength );
			} elseif ( $subtype == 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $formLength );
			} else {
				// Report the subtype that was actually read; the message used
				// to reference an undefined $formType variable.
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '$subtype'\n" );
			}
		}
		fclose( $file );
		return $info;
	}

	/**
	 * Read one 8-byte chunk header from the current file position.
	 *
	 * @param resource $file
	 * @return array array( chunk id, payload length ), or array( false, 0 )
	 *   when fewer than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( strlen( $header ) < 8 ) {
			return array( false, 0 );
		}

		$fields = unpack( 'a4chunk/Nlength', $header );
		return array( $fields['chunk'], $fields['length'] );
	}

	/**
	 * Seek past a chunk payload and, for odd lengths, its padding byte.
	 *
	 * @param resource $file
	 * @param int $chunkLength Number of payload bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		// Parenthesised explicitly: '==' binds tighter than '&'.
		if ( ( $chunkLength & 0x01 ) === 1 && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Locate the first FORM:DJVU page of a multi-page document and return
	 * its page information.
	 *
	 * @param resource $file Handle positioned just after the 'DJVM' subtype
	 * @param int $formLength Length of the enclosing FORM, from the file header
	 * @return array|bool Result of getPageInfo() for the first page, or false
	 *   when no page is found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			list( $chunk, $length ) = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk == 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype == 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage\n" );
					return $this->getPageInfo( $file, $length );
				}
				// The 4 subtype bytes just read count towards the FORM's length
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );
		return false;
	}

	/**
	 * Decode the INFO chunk expected at the start of a page FORM.
	 *
	 * @param resource $file Handle positioned at the page's first chunk header
	 * @param int $formLength Length of the page FORM (currently unused)
	 * @return array|bool Array with the keys 'width', 'height', 'version'
	 *   ("major.minor"), 'resolution' and 'gamma' (stored byte divided by 10),
	 *   or false when the INFO chunk is missing, too short or cut off
	 */
	private function getPageInfo( $file, $formLength ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( $chunk != 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );
			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );
			return false;
		}
		$data = fread( $file, $length );
		if ( strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off\n" );
			return false;
		}

		// Width and height are big-endian 16-bit, the resolution is
		// little-endian 16-bit, the remaining fields are single bytes.
		$fields = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );
		# Newer files have rotation info in byte 10, but we don't use it yet.

		return array(
			'width' => $fields['width'],
			'height' => $fields['height'],
			'version' => $fields['major'] . '.' . $fields['minor'],
			'resolution' => $fields['resolution'],
			'gamma' => $fields['gamma'] / 10.0 );
	}

	/**
	 * Build an XML description of the file using the configured external tools.
	 *
	 * Structure comes from djvudump (preferred) or djvutoxml; when djvutxt is
	 * configured and exits with status 0, its per-page text is appended as a
	 * <DjVuTxt> element and the result is wrapped in <mw-djvu>.
	 *
	 * @return string|bool|null XML text; false when the djvudump output could
	 *   not be converted; null when no tool is configured
	 */
	public function retrieveMetaData() {
		global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;
		wfProfileIn( __METHOD__ );

		if ( isset( $wgDjvuDump ) ) {
			# djvudump is faster as of version 3.5
			# http://sourceforge.net/tracker/index.php?func=detail&aid=1704049&group_id=32953&atid=406583
			wfProfileIn( 'djvudump' );
			$cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$xml = $this->convertDumpToXML( $dump );
			wfProfileOut( 'djvudump' );
		} elseif ( isset( $wgDjvuToXML ) ) {
			wfProfileIn( 'djvutoxml' );
			$cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
				wfEscapeShellArg( $this->mFilename );
			$xml = wfShellExec( $cmd );
			wfProfileOut( 'djvutoxml' );
		} else {
			$xml = null;
		}
		# Text layer
		if ( isset( $wgDjvuTxt ) ) {
			wfProfileIn( 'djvutxt' );
			$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd\n" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval, array(), array( 'memory' => self::DJVUTXT_MEMORY_LIMIT ) );
			wfProfileOut( 'djvutxt' );
			if ( $retval == 0 ) {
				# Strip some control characters
				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
				$reg = <<<EOR
					/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
					((?>    # Text to match is composed of atoms of either:
					  \\\\. # - any escaped character
					  |     # - any character different from " and \
					  [^"\\\\]+
					)*?)
					"\s*\)
					| # Or page can be empty ; in this case, djvutxt dumps ()
					\(\s*()\)/sx
EOR;
				$txt = preg_replace_callback( $reg, array( $this, 'pageTextCallback' ), $txt );
				$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
				// NOTE(review): when $xml is null/false here there is no
				// <DjVuXML> tag to rewrite, so the result ends with
				// </mw-djvu> without a matching opening tag.
				$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
				$xml = $xml . $txt . '</mw-djvu>';
			}
		}
		wfProfileOut( __METHOD__ );
		return $xml;
	}

	/**
	 * preg_replace_callback() handler turning one matched djvutxt page into
	 * a <PAGE> element.
	 *
	 * @param array $matches Regex matches; index 1 holds the page text
	 * @return string
	 */
	public function pageTextCallback( $matches ) {
		# Get rid of invalid UTF-8, strip control characters
		return '<PAGE value="' . htmlspecialchars( UtfNormal::cleanUp( $matches[1] ) ) . '" />';
	}

	/**
	 * Convert djvudump output into a DjVuXML document containing one
	 * <OBJECT> element per page found.
	 *
	 * @param string $dump Output of djvudump
	 * @return string|bool XML text, or false when the dump is empty, describes
	 *   an indirect multi-page document, or contains no parsable page
	 */
	public function convertDumpToXML( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

		$dump = str_replace( "\r", '', $dump );
		// strtok() keeps its position between calls; parseFormDjvu()
		// continues reading lines from this same tokenizer state.
		$line = strtok( $dump, "\n" );
		$m = false;
		$good = false;
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			if ( $this->parseFormDjvu( $line, $xml ) ) {
				$good = true;
			} else {
				return false;
			}
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			# Multi-page
			$parentLevel = strlen( $m[1] );
			# Find DIRM
			$line = strtok( "\n" );
			while ( $line !== false ) {
				$childLevel = strspn( $line, ' ' );
				if ( $childLevel <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );
					return false;
				}
				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					# Found page
					if ( $this->parseFormDjvu( $line, $xml ) ) {
						$good = true;
					} else {
						return false;
					}
				}
				$line = strtok( "\n" );
			}
		}
		if ( !$good ) {
			return false;
		}

		$xml .= "</BODY>\n</DjVuXML>\n";
		return $xml;
	}

	/**
	 * Consume the dump lines of one FORM:DJVU page and append an <OBJECT>
	 * element built from its INFO line.
	 *
	 * Lines are pulled from the strtok() state started by convertDumpToXML().
	 *
	 * @param string $line The FORM:DJVU line; its indentation marks the page level
	 * @param string &$xml XML buffer the element is appended to
	 * @return bool True when an INFO line was found and appended
	 */
	public function parseFormDjvu( $line, &$xml ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );
		$m = array();

		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match( '/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/', $line, $m ) ) {
				$xml .= Xml::tags( 'OBJECT',
					array(
						#'data' => '',
						#'type' => 'image/x.djvu',
						'height' => $m[2],
						'width' => $m[1],
						#'usemap' => '',
					),
					"\n" .
					Xml::element( 'PARAM', array( 'name' => 'DPI', 'value' => $m[3] ) ) . "\n" .
					Xml::element( 'PARAM', array( 'name' => 'GAMMA', 'value' => $m[4] ) ) . "\n"
				) . "\n";
				return true;
			}
			$line = strtok( "\n" );
		}
		# Not found
		return false;
	}
}