examples/PIPS/antiword/src/wordlib.c

00001 /*
00002  * wordlib.c
00003  * Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
00004  *
00005  * Description:
00006  * Deal with the internals of a MS Word file
00007  */
00008 
00009 #include "antiword.h"
00010 
00011 static BOOL     bOldMacFile = FALSE;
00012 
00013 
00014 /*
00015  * Common part of the file checking functions
00016  */
00017 static BOOL
00018 bCheckBytes(FILE *pFile, const UCHAR *aucBytes, size_t tBytes)
00019 {
00020         int     iIndex, iChar;
00021 
00022         fail(pFile == NULL || aucBytes == NULL || tBytes == 0);
00023 
00024         rewind(pFile);
00025 
00026         for (iIndex = 0; iIndex < (int)tBytes; iIndex++) {
00027                 iChar = getc(pFile);
00028                 if (iChar == EOF || iChar != (int)aucBytes[iIndex]) {
00029                         NO_DBG_HEX(iChar);
00030                         NO_DBG_HEX(aucBytes[iIndex]);
00031                         return FALSE;
00032                 }
00033         }
00034         return TRUE;
00035 } /* end of bCheckBytes */
00036 
00037 /*
00038  * This function checks whether the given file is or is not a "Word for DOS"
00039  * document
00040  */
00041 BOOL
00042 bIsWordForDosFile(FILE *pFile, long lFilesize)
00043 {
00044         static UCHAR    aucBytes[] =
00045                 { 0x31, 0xbe, 0x00, 0x00, 0x00, 0xab }; /* Word for DOS */
00046 
00047         DBG_MSG("bIsWordForDosFile");
00048 
00049         if (pFile == NULL || lFilesize < 0) {
00050                 DBG_MSG("No proper file given");
00051                 return FALSE;
00052         }
00053         if (lFilesize < 128) {
00054                 DBG_MSG("File too small to be a Word document");
00055                 return FALSE;
00056         }
00057         return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
00058 } /* end of bIsWordForDosFile */
00059 
00060 /*
00061  * This function checks whether the given file is or is not a file with an
00062  * OLE envelope (That is a document made by Word 6 or later)
00063  */
00064 static BOOL
00065 bIsWordFileWithOLE(FILE *pFile, long lFilesize)
00066 {
00067         static UCHAR    aucBytes[] =
00068                 { 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
00069         int     iTailLen;
00070 
00071         if (pFile == NULL || lFilesize < 0) {
00072                 DBG_MSG("No proper file given");
00073                 return FALSE;
00074         }
00075         if (lFilesize < (long)BIG_BLOCK_SIZE * 3) {
00076                 DBG_MSG("This file is too small to be a Word document");
00077                 return FALSE;
00078         }
00079 
00080         iTailLen = (int)(lFilesize % BIG_BLOCK_SIZE);
00081         switch (iTailLen) {
00082         case 0:         /* No tail, as it should be */
00083                 break;
00084         case 1:
00085         case 2:         /* Filesize mismatch or a buggy email program */
00086                 if ((int)(lFilesize % 3) == iTailLen) {
00087                         DBG_DEC(lFilesize);
00088                         return FALSE;
00089                 }
00090                 /*
00091                  * Ignore extra bytes caused by buggy email programs.
00092                  * They have bugs in their base64 encoding or decoding.
00093                  * 3 bytes -> 4 ascii chars -> 3 bytes
00094                  */
00095                 DBG_MSG("Document with extra bytes");
00096                 break;
00097         default:        /* Wrong filesize for a Word document */
00098                 DBG_DEC(lFilesize);
00099                 DBG_DEC(iTailLen);
00100                 return FALSE;
00101         }
00102         return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
00103 } /* end of bIsWordFileWithOLE */
00104 
00105 /*
00106  * This function checks whether the given file is or is not a RTF document
00107  */
00108 BOOL
00109 bIsRtfFile(FILE *pFile)
00110 {
00111         static UCHAR    aucBytes[] =
00112                 { '{', '\\', 'r', 't', 'f', '1' };
00113 
00114         DBG_MSG("bIsRtfFile");
00115 
00116         return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
00117 } /* end of bIsRtfFile */
00118 
00119 /*
00120  * This function checks whether the given file is or is not a WP document
00121  */
00122 BOOL
00123 bIsWordPerfectFile(FILE *pFile)
00124 {
00125         static UCHAR    aucBytes[] =
00126                 { 0xff, 'W', 'P', 'C' };
00127 
00128         DBG_MSG("bIsWordPerfectFile");
00129 
00130         return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
00131 } /* end of bIsWordPerfectFile */
00132 
00133 /*
00134  * This function checks whether the given file is or is not a "Win Word 1 or 2"
00135  * document
00136  */
00137 BOOL
00138 bIsWinWord12File(FILE *pFile, long lFilesize)
00139 {
00140         static UCHAR    aucBytes[2][4] = {
00141                 { 0x9b, 0xa5, 0x21, 0x00 },     /* Win Word 1.x */
00142                 { 0xdb, 0xa5, 0x2d, 0x00 },     /* Win Word 2.0 */
00143         };
00144         int     iIndex;
00145 
00146         DBG_MSG("bIsWinWord12File");
00147 
00148         if (pFile == NULL || lFilesize < 0) {
00149                 DBG_MSG("No proper file given");
00150                 return FALSE;
00151         }
00152         if (lFilesize < 384) {
00153                 DBG_MSG("This file is too small to be a Word document");
00154                 return FALSE;
00155         }
00156 
00157         for (iIndex = 0; iIndex < (int)elementsof(aucBytes); iIndex++) {
00158                 if (bCheckBytes(pFile,
00159                                 aucBytes[iIndex],
00160                                 elementsof(aucBytes[iIndex]))) {
00161                         return TRUE;
00162                 }
00163         }
00164         return FALSE;
00165 } /* end of bIsWinWord12File */
00166 
00167 /*
00168  * This function checks whether the given file is or is not a "Mac Word 4 or 5"
00169  * document
00170  */
00171 BOOL
00172 bIsMacWord45File(FILE *pFile)
00173 {
00174         static UCHAR    aucBytes[2][6] = {
00175                 { 0xfe, 0x37, 0x00, 0x1c, 0x00, 0x00 }, /* Mac Word 4 */
00176                 { 0xfe, 0x37, 0x00, 0x23, 0x00, 0x00 }, /* Mac Word 5 */
00177         };
00178         int     iIndex;
00179 
00180         DBG_MSG("bIsMacWord45File");
00181 
00182         for (iIndex = 0; iIndex < (int)elementsof(aucBytes); iIndex++) {
00183                 if (bCheckBytes(pFile,
00184                                 aucBytes[iIndex],
00185                                 elementsof(aucBytes[iIndex]))) {
00186                         return TRUE;
00187                 }
00188         }
00189         return FALSE;
00190 } /* end of bIsMacWord45File */
00191 
/*
 * iGuessVersionNumber - guess the Word version number from first few bytes
 *
 * The checks are tried in this order and the first one that succeeds
 * decides the result:
 *   Word for DOS -> 0, Win Word 1 or 2 -> 2, Mac Word 4 or 5 -> 5,
 *   file with an OLE envelope -> 6
 *
 * Returns the guessed version number or -1 when no guess is possible
 */
int
iGuessVersionNumber(FILE *pFile, long lFilesize)
{
        int     iVersion;

        if (bIsWordForDosFile(pFile, lFilesize)) {
                iVersion = 0;
        } else if (bIsWinWord12File(pFile, lFilesize)) {
                iVersion = 2;
        } else if (bIsMacWord45File(pFile)) {
                iVersion = 5;
        } else if (bIsWordFileWithOLE(pFile, lFilesize)) {
                iVersion = 6;
        } else {
                iVersion = -1;
        }
        return iVersion;
} /* end of iGuessVersionNumber */
00214 
00215 /*
00216  * iGetVersionNumber - get the Word version number from the header
00217  *
00218  * Returns the version number or -1 when unknown
00219  */
00220 int
00221 iGetVersionNumber(const UCHAR *aucHeader)
00222 {
00223         USHORT  usFib, usChse;
00224 
00225         usFib = usGetWord(0x02, aucHeader);
00226         if (usFib >= 0x1000) {
00227                 /* To big: must be MacWord using Big Endian */
00228                 DBG_HEX(usFib);
00229                 usFib = usGetWordBE(0x02, aucHeader);
00230         }
00231         DBG_DEC(usFib);
00232         bOldMacFile = FALSE;
00233         switch (usFib) {
00234         case   0:
00235                 DBG_MSG("Word for DOS");
00236                 return 0;
00237         case  28:
00238                 DBG_MSG("Word 4 for Macintosh");
00239                 bOldMacFile = TRUE;
00240                 return 4;
00241         case  33:
00242                 DBG_MSG("Word 1.x for Windows");
00243                 return 1;
00244         case  35:
00245                 DBG_MSG("Word 5 for Macintosh");
00246                 bOldMacFile = TRUE;
00247                 return 5;
00248         case  45:
00249                 DBG_MSG("Word 2 for Windows");
00250                 return 2;
00251         case 101:
00252         case 102:
00253                 DBG_MSG("Word 6 for Windows");
00254                 return 6;
00255         case 103:
00256         case 104:
00257                 usChse = usGetWord(0x14, aucHeader);
00258                 DBG_DEC(usChse);
00259                 switch (usChse) {
00260                 case 0:
00261                         DBG_MSG("Word 7 for Win95");
00262                         return 7;
00263                 case 256:
00264                         DBG_MSG("Word 6 for Macintosh");
00265                         bOldMacFile = TRUE;
00266                         return 6;
00267                 default:
00268                         DBG_FIXME();
00269                         if ((int)ucGetByte(0x05, aucHeader) == 0xe0) {
00270                                 DBG_MSG("Word 7 for Win95");
00271                                 return 7;
00272                         }
00273                         DBG_MSG("Word 6 for Macintosh");
00274                         bOldMacFile = TRUE;
00275                         return 6;
00276                 }
00277         default:
00278                 usChse = usGetWord(0x14, aucHeader);
00279                 DBG_DEC(usChse);
00280                 if (usFib < 192) {
00281                         /* Unknown or unsupported version of Word */
00282                         DBG_DEC(usFib);
00283                         return -1;
00284                 }
00285                 DBG_MSG_C(usChse != 256, "Word97 for Win95/98/NT");
00286                 DBG_MSG_C(usChse == 256, "Word98 for Macintosh");
00287                 return 8;
00288         }
00289 } /* end of iGetVersionNumber */
00290 
00291 /*
00292  * TRUE if the current file was made by Word version 6 or older on an
00293  * Apple Macintosh, otherwise FALSE.
00294  * This function hides the methode of how to find out from the rest of the
00295  * program.
00296  */
00297 BOOL
00298 bIsOldMacFile(void)
00299 {
00300         return bOldMacFile;
00301 } /* end of bIsOldMacFile */
00302 
/*
 * iInitDocument - initialize a document
 *
 * Guesses the kind of Word file with iGuessVersionNumber() and hands the
 * file to the matching initialisation function (DOS, WIN, MAC or OLE).
 *
 * Returns the version of Word that made the document or -1
 */
int
iInitDocument(FILE *pFile, long lFilesize)
{
        int     iGuess;

        iGuess = iGuessVersionNumber(pFile, lFilesize);
        switch (iGuess) {
        case 0:
                return iInitDocumentDOS(pFile, lFilesize);
        case 2:
                return iInitDocumentWIN(pFile, lFilesize);
        case 5:
                return iInitDocumentMAC(pFile, lFilesize);
        case 6:
                return iInitDocumentOLE(pFile, lFilesize);
        default:
                break;
        }
        /* No initialisation function for this guess */
        DBG_DEC(iGuess);
        return -1;
} /* end of iInitDocument */
00334 
/*
 * vFreeDocument - free a document by free-ing its parts
 *
 * Calls the destroy function of every list and table, one after the other.
 * NOTE(review): the call order is kept exactly as it was; whether the
 * destroy functions depend on that order is not visible from here.
 */
void
vFreeDocument(void)
{
        DBG_MSG("vFreeDocument");

        /* Free the memory: block lists */
        vDestroyTextBlockList();
        vDestroyDataBlockList();

        /* Free the memory: information lists */
        vDestroyListInfoList();
        vDestroyRowInfoList();
        vDestroyStyleInfoList();
        vDestroyFontInfoList();
        vDestroyStylesheetList();
        vDestroyPictInfoList();
        vDestroyDocumentInfoList();
        vDestroySectionInfoList();
        vDestroyHdrFtrInfoList();
        vDestroyPropModList();
        vDestroyNotesInfoLists();

        /* Free the memory: font table and summary information */
        vDestroyFontTable();
        vDestroySummaryInfo();
} /* end of vFreeDocument */

Generated by  doxygen 1.6.2