MediaWiki
REL1_20
|
<?php
/**
 * Reads basic information out of a DjVu file.
 *
 * The header parsing (getInfo() and its helpers) walks the IFF-style chunk
 * layout of the file directly; retrieveMetaData() instead shells out to the
 * external tools configured in $wgDjvuDump, $wgDjvuToXML and $wgDjvuTxt.
 */
class DjVuImage {
	/**
	 * Path of the file to inspect, as passed to the constructor.
	 */
	public $mFilename;

	/**
	 * @param $filename String: path of the DjVu file to inspect
	 */
	function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check whether getInfo() is able to parse the file.
	 *
	 * @return bool
	 */
	public function isValid() {
		$info = $this->getInfo();
		return $info !== false;
	}

	/**
	 * Return width/height data in the same layout as getimagesize().
	 *
	 * @return array|bool array( width, height, 'DjVu', 'width="W" height="H"' ),
	 *   or false if getInfo() could not parse the file
	 */
	public function getImageSize() {
		$data = $this->getInfo();

		if ( $data !== false ) {
			$width = $data['width'];
			$height = $data['height'];

			return array( $width, $height, 'DjVu',
				"width=\"$width\" height=\"$height\"" );
		}
		return false;
	}

	// ---------

	/**
	 * Echo the chunk structure of the file, for debugging.
	 *
	 * Prints the top-level chunk name and length, then delegates to dumpForm()
	 * for the nested chunks. Prints nothing if the file cannot be opened or is
	 * shorter than the 12-byte header.
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			// fopen() has already raised its own warning; nothing to dump.
			return;
		}

		$header = fread( $file, 12 );
		if ( $header !== false && strlen( $header ) >= 12 ) {
			$fields = unpack( 'a4magic/a4chunk/NchunkLength', $header );
			echo "{$fields['chunk']} {$fields['chunkLength']}\n";
			$this->dumpForm( $file, $fields['chunkLength'], 1 );
		}
		fclose( $file );
	}

	/**
	 * Echo the chunks contained in one FORM chunk, recursing into nested FORMs.
	 *
	 * @param $file Resource: open file handle, positioned just after the FORM length
	 * @param $length Integer: length of the FORM chunk in bytes
	 * @param $indent Integer: nesting depth; each level indents by four spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$padding = str_repeat( ' ', $indent * 4 );
		echo $padding . "($secondary)\n";

		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			// A truncated header cannot be unpacked; treat it like end of file.
			if ( $chunkHeader === false || strlen( $chunkHeader ) < 8 ) {
				break;
			}
			$fields = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $fields['chunk'];
			$chunkLength = $fields['chunkLength'];
			echo $padding . "$chunk $chunkLength\n";

			if ( $chunk === 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				fseek( $file, $chunkLength, SEEK_CUR );
				// Parenthesised on purpose: "==" binds tighter than "&".
				if ( ( $chunkLength & 1 ) === 1 ) {
					// Padding byte between chunks
					fseek( $file, 1, SEEK_CUR );
				}
			}
		}
	}

	/**
	 * Parse the file header and return the information of the first page.
	 *
	 * @return array|bool the array built by getPageInfo() (keys 'width',
	 *   'height', 'version', 'resolution', 'gamma'), or false if the file
	 *   cannot be opened, is too short, or is not a recognized DjVu file
	 */
	public function getInfo() {
		wfSuppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		wfRestoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );
			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( $header === false || strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header\n" );
		} else {
			$fields = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
			$magic = $fields['magic'];
			$formLength = $fields['formLength'];
			$subtype = $fields['subtype'];

			if ( $magic !== 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file\n" );
			} elseif ( $subtype === 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file, $formLength );
			} elseif ( $subtype === 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $formLength );
			} else {
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '$subtype'\n" );
			}
		}
		fclose( $file );
		return $info;
	}

	/**
	 * Read an 8-byte chunk header (4-byte name, 32-bit big-endian length).
	 *
	 * @param $file Resource: open file handle
	 * @return array array( chunk name, length ), or array( false, 0 ) when
	 *   fewer than 8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( $header === false || strlen( $header ) < 8 ) {
			return array( false, 0 );
		}
		$fields = unpack( 'a4chunk/Nlength', $header );
		return array( $fields['chunk'], $fields['length'] );
	}

	/**
	 * Seek past a chunk body, plus the padding byte that follows an
	 * odd-length chunk.
	 *
	 * @param $file Resource: open file handle
	 * @param $chunkLength Integer: number of bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		// Parenthesised on purpose: "==" binds tighter than "&".
		if ( ( $chunkLength & 0x01 ) === 1 && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Find the first FORM:DJVU page of a multi-page document and return its
	 * information.
	 *
	 * @param $file Resource: open file handle, positioned after the DJVM subtype
	 * @param $formLength Integer: length of the enclosing FORM chunk
	 * @return array|bool result of getPageInfo() for the first page, or false
	 *   if no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			list( $chunk, $length ) = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk === 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype === 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage\n" );
					return $this->getPageInfo( $file, $length );
				}
				// The 4-byte subtype just read counts towards the chunk length.
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );
		return false;
	}

	/**
	 * Read the INFO chunk that starts a page and decode its fields.
	 *
	 * @param $file Resource: open file handle, positioned at the INFO chunk header
	 * @param $formLength Integer: length of the enclosing FORM chunk (not used)
	 * @return array|bool array with keys 'width', 'height', 'version',
	 *   'resolution' and 'gamma', or false if the INFO chunk is missing,
	 *   shorter than 9 bytes, or cut off
	 */
	private function getPageInfo( $file, $formLength ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( $chunk != 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );
			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );
			return false;
		}
		$data = fread( $file, $length );
		if ( $data === false || strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off\n" );
			return false;
		}

		// Width and height are big-endian, the resolution is little-endian.
		$fields = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );
		# Newer files have rotation info in byte 10, but we don't use it yet.

		return array(
			'width' => $fields['width'],
			'height' => $fields['height'],
			'version' => "{$fields['major']}.{$fields['minor']}",
			'resolution' => $fields['resolution'],
			'gamma' => $fields['gamma'] / 10.0 );
	}

	/**
	 * Build an XML metadata string by running the configured external tools.
	 *
	 * The structure comes from $wgDjvuDump (converted by convertDumpToXML())
	 * or, failing that, from $wgDjvuToXML. If $wgDjvuTxt is set and exits with
	 * status 0, its per-page text is appended and everything is wrapped in
	 * an <mw-djvu> element.
	 *
	 * NOTE(review): when the structure step yields null or false, the text
	 * layer and the closing </mw-djvu> are still appended although no opening
	 * tag was inserted -- confirm whether callers rely on this.
	 *
	 * @return string|bool|null the XML, false if the dump could not be
	 *   converted, or null if no tool is configured
	 */
	public function retrieveMetaData() {
		global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;
		wfProfileIn( __METHOD__ );

		if ( isset( $wgDjvuDump ) ) {
			# djvudump is faster as of version 3.5
			# http://sourceforge.net/tracker/index.php?func=detail&aid=1704049&group_id=32953&atid=406583
			wfProfileIn( 'djvudump' );
			$cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$xml = $this->convertDumpToXML( $dump );
			wfProfileOut( 'djvudump' );
		} elseif ( isset( $wgDjvuToXML ) ) {
			wfProfileIn( 'djvutoxml' );
			$cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
				wfEscapeShellArg( $this->mFilename );
			$xml = wfShellExec( $cmd );
			wfProfileOut( 'djvutoxml' );
		} else {
			$xml = null;
		}
		# Text layer
		if ( isset( $wgDjvuTxt ) ) {
			wfProfileIn( 'djvutxt' );
			$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename ) ;
			wfDebug( __METHOD__.": $cmd\n" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval );
			wfProfileOut( 'djvutxt' );
			if ( $retval == 0 ) {
				# Strip some control characters
				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
				$reg = <<<EOR
/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
((?>    # Text to match is composed of atoms of either:
  \\\\. # - any escaped character
  |     # - any character different from " and \
  [^"\\\\]+
)*?)
"\s*\)
| # Or page can be empty ; in this case, djvutxt dumps ()
\(\s*()\)/sx
EOR;
				$txt = preg_replace_callback( $reg, array( $this, 'pageTextCallback' ), $txt );
				$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
				$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
				$xml = $xml . $txt. '</mw-djvu>' ;
			}
		}
		wfProfileOut( __METHOD__ );
		return $xml;
	}

	/**
	 * preg_replace_callback() handler used by retrieveMetaData(): turn the
	 * text captured for one page into a <PAGE> element.
	 *
	 * @param $matches Array: regex matches; index 1 holds the page text
	 * @return string
	 */
	public function pageTextCallback( $matches ) {
		# Get rid of invalid UTF-8, strip control characters
		return '<PAGE value="' . htmlspecialchars( UtfNormal::cleanUp( $matches[1] ) ) . '" />';
	}

	/**
	 * Convert the textual dump passed in by retrieveMetaData() into a
	 * DjVuXML document with one OBJECT element per page.
	 *
	 * The dump is tokenized line by line with strtok(); parseFormDjvu()
	 * continues that same tokenization, so the two methods share state.
	 *
	 * @param $dump String: dump text
	 * @return string|bool the XML, or false if the dump is empty, describes an
	 *   indirect multi-page document, or contains no parseable page
	 */
	public function convertDumpToXML( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

		$dump = str_replace( "\r", '', $dump );
		$line = strtok( $dump, "\n" );
		$m = false;
		$good = false;
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			if ( $this->parseFormDjvu( $line, $xml ) ) {
				$good = true;
			} else {
				return false;
			}
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			# Multi-page
			$parentLevel = strlen( $m[1] );
			# Find DIRM
			$line = strtok( "\n" );
			while ( $line !== false ) {
				$childLevel = strspn( $line, ' ' );
				if ( $childLevel <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );
					return false;
				}
				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					# Found page
					if ( $this->parseFormDjvu( $line, $xml ) ) {
						$good = true;
					} else {
						return false;
					}
				}
				$line = strtok( "\n" );
			}
		}
		if ( !$good ) {
			return false;
		}

		$xml .= "</BODY>\n</DjVuXML>\n";
		return $xml;
	}

	/**
	 * Consume the lines nested under one FORM:DJVU line and append an OBJECT
	 * element built from its INFO line to $xml.
	 *
	 * Continues the strtok() tokenization started by convertDumpToXML().
	 *
	 * @param $line String: the FORM:DJVU line; its indentation is the parent level
	 * @param $xml String: XML being built, appended to by reference
	 * @return bool true if an INFO line was found and appended, false otherwise
	 */
	public function parseFormDjvu( $line, &$xml ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );

		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match( '/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/', $line, $m ) ) {
				$xml .= Xml::tags( 'OBJECT',
					array(
						#'data' => '',
						#'type' => 'image/x.djvu',
						'height' => $m[2],
						'width' => $m[1],
						#'usemap' => '',
					),
					"\n" .
					Xml::element( 'PARAM', array( 'name' => 'DPI', 'value' => $m[3] ) ) . "\n" .
					Xml::element( 'PARAM', array( 'name' => 'GAMMA', 'value' => $m[4] ) ) . "\n"
				) . "\n";
				return true;
			}
			$line = strtok( "\n" );
		}
		# Not found
		return false;
	}
}