MediaWiki
REL1_19
|
<?php

/**
 * Inspects a DjVu file on disk.
 *
 * The class reads the IFF-style chunk structure of the file directly to
 * obtain the dimensions, version, resolution and gamma of the first page,
 * and can build an XML metadata document by running the external djvudump,
 * djvutoxml and djvutxt tools configured through $wgDjvuDump, $wgDjvuToXML
 * and $wgDjvuTxt.
 */
class DjVuImage {
	/**
	 * Path of the file to inspect, exactly as passed to the constructor.
	 * Declared explicitly so the object does not rely on a dynamic property.
	 */
	public $mFilename;

	/**
	 * @param $filename String: path of the DjVu file to inspect
	 */
	function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Tell whether page information could be read from the file.
	 *
	 * @return Boolean: true when getInfo() returned something other than false
	 */
	public function isValid() {
		$info = $this->getInfo();
		return $info !== false;
	}

	/**
	 * Return the image dimensions as a four-element array:
	 * width, height, the string 'DjVu', and a ready-made
	 * 'width="..." height="..."' attribute string.
	 *
	 * @return Array|false: false when getInfo() fails
	 */
	public function getImageSize() {
		$data = $this->getInfo();

		if ( $data !== false ) {
			$width = $data['width'];
			$height = $data['height'];

			return array( $width, $height, 'DjVu',
				"width=\"$width\" height=\"$height\"" );
		}
		return false;
	}

	// ---------

	/**
	 * Echo the chunk structure of the file, one line per chunk.
	 *
	 * Prints nothing when the file cannot be opened or is shorter than the
	 * 12-byte magic + chunk header.
	 */
	function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			return;
		}

		$header = fread( $file, 12 );
		if ( strlen( $header ) < 12 ) {
			// Not enough bytes for magic + chunk id + chunk length
			fclose( $file );
			return;
		}

		$fields = unpack( 'a4magic/a4chunk/NchunkLength', $header );
		$chunk = $fields['chunk'];
		$chunkLength = $fields['chunkLength'];

		echo "$chunk $chunkLength\n";
		$this->dumpForm( $file, $chunkLength, 1 );
		fclose( $file );
	}

	/**
	 * Echo the secondary identifier of a FORM chunk followed by its child
	 * chunks, recursing into nested FORM chunks.
	 *
	 * @param $file Resource: open file handle positioned just after the FORM chunk header
	 * @param $length Integer: length of the FORM chunk as read from its header
	 * @param $indent Integer: nesting depth; each level indents the output by four spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$prefix = str_repeat( ' ', $indent * 4 );
		echo $prefix . "($secondary)\n";

		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( strlen( $chunkHeader ) < 8 ) {
				// End of file, or a truncated header that cannot be unpacked
				break;
			}

			$fields = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $fields['chunk'];
			$chunkLength = $fields['chunkLength'];
			echo $prefix . "$chunk $chunkLength\n";

			if ( $chunk == 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				fseek( $file, $chunkLength, SEEK_CUR );
				// Parenthesised on purpose: '==' binds tighter than '&' in PHP
				if ( ( $chunkLength & 1 ) == 1 ) {
					// Padding byte between chunks
					fseek( $file, 1, SEEK_CUR );
				}
			}
		}
	}

	/**
	 * Read the file header and return information about the first page.
	 *
	 * @return Array|false: the array built by getPageInfo() (keys width,
	 *   height, version, resolution, gamma), or false when the file cannot be
	 *   opened, is too short, does not start with 'AT&T', or has a subtype
	 *   other than 'DJVU' / 'DJVM'
	 */
	function getInfo() {
		wfSuppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		wfRestoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );
			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header\n" );
		} else {
			$fields = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
			$magic = $fields['magic'];
			$formLength = $fields['formLength'];
			$subtype = $fields['subtype'];

			if ( $magic != 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file\n" );
			} elseif ( $subtype == 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file, $formLength );
			} elseif ( $subtype == 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $formLength );
			} else {
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '$subtype'\n" );
			}
		}
		fclose( $file );
		return $info;
	}

	/**
	 * Read an 8-byte chunk header from the current file position.
	 *
	 * @param $file Resource: open file handle
	 * @return Array: ( chunk id, chunk length ), or ( false, 0 ) when fewer
	 *   than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( strlen( $header ) < 8 ) {
			return array( false, 0 );
		}

		$fields = unpack( 'a4chunk/Nlength', $header );
		return array( $fields['chunk'], $fields['length'] );
	}

	/**
	 * Seek past the body of a chunk, plus one padding byte when the chunk
	 * length is odd and the end of file has not been reached.
	 *
	 * @param $file Resource: open file handle positioned at the start of the chunk body
	 * @param $chunkLength Integer: number of body bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		// Parenthesised on purpose: '==' binds tighter than '&' in PHP
		if ( ( $chunkLength & 0x01 ) == 1 && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Scan the chunks of a multi-page (DJVM) document for the first
	 * FORM:DJVU chunk and return that page's information.
	 *
	 * @param $file Resource: open file handle positioned after the 16-byte file header
	 * @param $formLength Integer: length of the outer FORM chunk
	 * @return Array|false: result of getPageInfo() for the first page found,
	 *   or false when no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			list( $chunk, $length ) = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk == 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype == 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage\n" );
					return $this->getPageInfo( $file, $length );
				}
				// The 4-byte subtype has already been consumed
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );
		return false;
	}

	/**
	 * Read the INFO chunk expected at the current file position and decode it.
	 *
	 * @param $file Resource: open file handle positioned at the INFO chunk header
	 * @param $formLength Integer: length of the enclosing FORM chunk (not used here)
	 * @return Array|false: array with keys 'width', 'height', 'version'
	 *   ("major.minor"), 'resolution' and 'gamma' (stored byte divided by 10),
	 *   or false when the next chunk is not INFO, is shorter than 9 bytes,
	 *   or is cut off
	 */
	private function getPageInfo( $file, $formLength ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( $chunk != 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );
			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );
			return false;
		}
		$data = fread( $file, $length );
		if ( strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off\n" );
			return false;
		}

		$fields = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );
		# Newer files have rotation info in byte 10, but we don't use it yet.

		return array(
			'width' => $fields['width'],
			'height' => $fields['height'],
			'version' => "{$fields['major']}.{$fields['minor']}",
			'resolution' => $fields['resolution'],
			'gamma' => $fields['gamma'] / 10.0 );
	}

	/**
	 * Build an XML metadata string for the file by running external tools.
	 *
	 * If $wgDjvuDump is set, its output is converted with convertDumpToXML();
	 * otherwise, if $wgDjvuToXML is set, that tool's output is used directly;
	 * otherwise the XML starts out as null. If $wgDjvuTxt is set and the tool
	 * exits with status 0, its per-page text is wrapped in <DjVuTxt> and the
	 * whole result is wrapped in <mw-djvu>.
	 *
	 * @return String|false|null
	 */
	function retrieveMetaData() {
		global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;
		wfProfileIn( __METHOD__ );

		if ( isset( $wgDjvuDump ) ) {
			# djvudump is faster as of version 3.5
			# http://sourceforge.net/tracker/index.php?func=detail&aid=1704049&group_id=32953&atid=406583
			wfProfileIn( 'djvudump' );
			$cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$xml = $this->convertDumpToXML( $dump );
			wfProfileOut( 'djvudump' );
		} elseif ( isset( $wgDjvuToXML ) ) {
			wfProfileIn( 'djvutoxml' );
			$cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
				wfEscapeShellArg( $this->mFilename );
			$xml = wfShellExec( $cmd );
			wfProfileOut( 'djvutoxml' );
		} else {
			$xml = null;
		}
		# Text layer
		if ( isset( $wgDjvuTxt ) ) {
			wfProfileIn( 'djvutxt' );
			$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename ) ;
			wfDebug( __METHOD__.": $cmd\n" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval );
			wfProfileOut( 'djvutxt' );
			if ( $retval == 0 ) {
				# Strip some control characters
				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
				$reg = <<<EOR
					/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
					((?>    # Text to match is composed of atoms of either:
					  \\\\. # - any escaped character
					  |     # - any character different from " and \
					  [^"\\\\]+
					)*?)
					"\s*\)
					| # Or page can be empty ; in this case, djvutxt dumps ()
					\(\s*()\)/sx
EOR;
				$txt = preg_replace_callback( $reg, array( $this, 'pageTextCallback' ), $txt );
				$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
				# NOTE(review): the opening <mw-djvu> is only inserted where a
				# <DjVuXML> tag is found; when no XML was produced above, the
				# result has a closing </mw-djvu> without an opening one.
				# Confirm that callers cope with that before changing it.
				$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
				$xml = $xml . $txt. '</mw-djvu>' ;
			}
		}
		wfProfileOut( __METHOD__ );
		return $xml;
	}

	/**
	 * preg_replace_callback() handler used by retrieveMetaData(): turns the
	 * text captured for one page into a <PAGE value="..." /> element.
	 *
	 * @param $matches Array: regex match; index 1 holds the page text
	 * @return String
	 */
	function pageTextCallback( $matches ) {
		# Get rid of invalid UTF-8, strip control characters
		return '<PAGE value="' . htmlspecialchars( UtfNormal::cleanUp( $matches[1] ) ) . '" />';
	}

	/**
	 * Convert the text output of djvudump into a DjVuXML document containing
	 * one OBJECT element per page.
	 *
	 * @param $dump String: output of djvudump
	 * @return String|false: the XML, or false when the dump is empty, a page
	 *   has no parsable INFO line, no page was found, or the document is an
	 *   indirect multi-page document
	 */
	function convertDumpToXML( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

		$dump = str_replace( "\r", '', $dump );
		// strtok() keeps its position between calls; parseFormDjvu() continues
		// reading lines from this same tokenizer state.
		$line = strtok( $dump, "\n" );
		$m = false;
		$good = false;
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			if ( $this->parseFormDjvu( $line, $xml ) ) {
				$good = true;
			} else {
				return false;
			}
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			# Multi-page
			$parentLevel = strlen( $m[1] );
			# Find DIRM
			$line = strtok( "\n" );
			while ( $line !== false ) {
				$childLevel = strspn( $line, ' ' );
				if ( $childLevel <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );
					return false;
				}
				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					# Found page
					if ( $this->parseFormDjvu( $line, $xml ) ) {
						$good = true;
					} else {
						return false;
					}
				}
				$line = strtok( "\n" );
			}
		}
		if ( !$good ) {
			return false;
		}

		$xml .= "</BODY>\n</DjVuXML>\n";
		return $xml;
	}

	/**
	 * Consume the djvudump lines nested under a FORM:DJVU line and, when an
	 * INFO line is found among them, append an OBJECT element describing the
	 * page to $xml.
	 *
	 * Lines are pulled from the strtok() state set up by convertDumpToXML().
	 *
	 * @param $line String: the FORM:DJVU line; its leading spaces give the parent indent level
	 * @param &$xml String: XML accumulated so far, appended to in place
	 * @return Boolean: true when an INFO line was parsed, false otherwise
	 */
	function parseFormDjvu( $line, &$xml ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );

		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match( '/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/', $line, $m ) ) {
				$xml .= Xml::tags( 'OBJECT',
					array(
						#'data' => '',
						#'type' => 'image/x.djvu',
						'height' => $m[2],
						'width' => $m[1],
						#'usemap' => '',
					),
					"\n" .
					Xml::element( 'PARAM', array( 'name' => 'DPI', 'value' => $m[3] ) ) . "\n" .
					Xml::element( 'PARAM', array( 'name' => 'GAMMA', 'value' => $m[4] ) ) . "\n"
				) . "\n";
				return true;
			}
			$line = strtok( "\n" );
		}
		# Not found
		return false;
	}
}