examples/PIPS/antiword/src/findtext.c

00001 /*
00002  * findtext.c
00003  * Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
00004  *
00005  * Description:
00006  * Find the blocks that contain the text of MS Word files
00007  */
00008 
00009 #include <stdio.h>
00010 #include <stdlib.h>
00011 #include "antiword.h"
00012 
00013 
00014 /*
00015  * bAddTextBlocks - Add the blocks to the text block list
00016  *
00017  * Returns TRUE when successful, FALSE if not
00018  */
00019 BOOL
00020 bAddTextBlocks(ULONG ulCharPosFirst, ULONG ulTotalLength,
00021         BOOL bUsesUnicode, USHORT usPropMod,
00022         ULONG ulStartBlock, const ULONG *aulBBD, size_t tBBDLen)
00023 {
00024         text_block_type tTextBlock;
00025         ULONG   ulCharPos, ulOffset, ulIndex;
00026         long    lToGo;
00027 
00028         fail(ulTotalLength > (ULONG)LONG_MAX / 2);
00029         fail(ulStartBlock > MAX_BLOCKNUMBER && ulStartBlock != END_OF_CHAIN);
00030         fail(aulBBD == NULL);
00031 
00032         NO_DBG_HEX(ulCharPosFirst);
00033         NO_DBG_DEC(ulTotalLength);
00034 
00035         if (bUsesUnicode) {
00036                 /* One character equals two bytes */
00037                 NO_DBG_MSG("Uses Unicode");
00038                 lToGo = (long)ulTotalLength * 2;
00039         } else {
00040                 /* One character equals one byte */
00041                 NO_DBG_MSG("Uses ASCII");
00042                 lToGo = (long)ulTotalLength;
00043         }
00044 
00045         ulCharPos = ulCharPosFirst;
00046         ulOffset = ulCharPosFirst;
00047         for (ulIndex = ulStartBlock;
00048              ulIndex != END_OF_CHAIN && lToGo > 0;
00049              ulIndex = aulBBD[ulIndex]) {
00050                 if (ulIndex >= (ULONG)tBBDLen) {
00051                         DBG_DEC(ulIndex);
00052                         DBG_DEC(tBBDLen);
00053                         werr(1, "The Big Block Depot is damaged");
00054                 }
00055                 if (ulOffset >= BIG_BLOCK_SIZE) {
00056                         ulOffset -= BIG_BLOCK_SIZE;
00057                         continue;
00058                 }
00059                 tTextBlock.ulFileOffset =
00060                         (ulIndex + 1) * BIG_BLOCK_SIZE + ulOffset;
00061                 tTextBlock.ulCharPos = ulCharPos;
00062                 tTextBlock.ulLength = min(BIG_BLOCK_SIZE - ulOffset,
00063                                                 (ULONG)lToGo);
00064                 tTextBlock.bUsesUnicode = bUsesUnicode;
00065                 tTextBlock.usPropMod = usPropMod;
00066                 ulOffset = 0;
00067                 if (!bAdd2TextBlockList(&tTextBlock)) {
00068                         DBG_HEX(tTextBlock.ulFileOffset);
00069                         DBG_HEX(tTextBlock.ulCharPos);
00070                         DBG_DEC(tTextBlock.ulLength);
00071                         DBG_DEC(tTextBlock.bUsesUnicode);
00072                         DBG_DEC(tTextBlock.usPropMod);
00073                         return FALSE;
00074                 }
00075                 ulCharPos += tTextBlock.ulLength;
00076                 lToGo -= (long)tTextBlock.ulLength;
00077         }
00078         DBG_DEC_C(lToGo != 0, lToGo);
00079         return lToGo == 0;
00080 } /* end of bAddTextBlocks */
00081 
00082 /*
00083  * bGet6DocumentText - make a list of the text blocks of Word 6/7 files
00084  *
00085  * Code for "fast saved" files.
00086  *
00087  * Returns TRUE when successful, FALSE if not
00088  */
00089 BOOL
00090 bGet6DocumentText(FILE *pFile, BOOL bUsesUnicode, ULONG ulStartBlock,
00091         const ULONG *aulBBD, size_t tBBDLen, const UCHAR *aucHeader)
00092 {
00093         UCHAR   *aucBuffer;
00094         ULONG   ulBeginTextInfo, ulTextOffset, ulTotLength;
00095         size_t  tTextInfoLen;
00096         int     iIndex, iType, iOff, iLen, iPieces;
00097         USHORT  usPropMod;
00098 
00099         DBG_MSG("bGet6DocumentText");
00100 
00101         fail(pFile == NULL);
00102         fail(aulBBD == NULL);
00103         fail(aucHeader == NULL);
00104 
00105         ulBeginTextInfo = ulGetLong(0x160, aucHeader);  /* fcClx */
00106         DBG_HEX(ulBeginTextInfo);
00107         tTextInfoLen = (size_t)ulGetLong(0x164, aucHeader);     /* lcbClx */
00108         DBG_DEC(tTextInfoLen);
00109 
00110         aucBuffer = xmalloc(tTextInfoLen);
00111         if (!bReadBuffer(pFile, ulStartBlock,
00112                         aulBBD, tBBDLen, BIG_BLOCK_SIZE,
00113                         aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
00114                 aucBuffer = xfree(aucBuffer);
00115                 return FALSE;
00116         }
00117         NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
00118 
00119         iOff = 0;
00120         while ((size_t)iOff < tTextInfoLen) {
00121                 iType = (int)ucGetByte(iOff, aucBuffer);
00122                 iOff++;
00123                 if (iType == 0) {
00124                         DBG_FIXME();
00125                         iOff++;
00126                         continue;
00127                 }
00128                 if (iType == 1) {
00129                         iLen = (int)usGetWord(iOff, aucBuffer);
00130                         vAdd2PropModList(aucBuffer + iOff);
00131                         iOff += iLen + 2;
00132                         continue;
00133                 }
00134                 if (iType != 2) {
00135                         werr(0, "Unknown type of 'fastsaved' format");
00136                         aucBuffer = xfree(aucBuffer);
00137                         return FALSE;
00138                 }
00139                 /* Type 2 */
00140                 iLen = (int)usGetWord(iOff, aucBuffer);
00141                 NO_DBG_DEC(iLen);
00142                 iOff += 4;
00143                 iPieces = (iLen - 4) / 12;
00144                 DBG_DEC(iPieces);
00145                 for (iIndex = 0; iIndex < iPieces; iIndex++) {
00146                         ulTextOffset = ulGetLong(
00147                                 iOff + (iPieces + 1) * 4 + iIndex * 8 + 2,
00148                                 aucBuffer);
00149                         usPropMod = usGetWord(
00150                                 iOff + (iPieces + 1) * 4 + iIndex * 8 + 6,
00151                                 aucBuffer);
00152                         ulTotLength = ulGetLong(iOff + (iIndex + 1) * 4,
00153                                                 aucBuffer) -
00154                                         ulGetLong(iOff + iIndex * 4,
00155                                                 aucBuffer);
00156                         NO_DBG_HEX_C(usPropMod != 0, usPropMod);
00157                         if (!bAddTextBlocks(ulTextOffset, ulTotLength,
00158                                         bUsesUnicode, usPropMod,
00159                                         ulStartBlock,
00160                                         aulBBD, tBBDLen)) {
00161                                 aucBuffer = xfree(aucBuffer);
00162                                 return FALSE;
00163                         }
00164                 }
00165                 break;
00166         }
00167         aucBuffer = xfree(aucBuffer);
00168         return TRUE;
00169 } /* end of bGet6DocumentText */
00170 
00171 /*
00172  * bGet8DocumentText - make a list of the text blocks of Word 8/97 files
00173  *
00174  * Returns TRUE when successful, FALSE if not
00175  */
00176 BOOL
00177 bGet8DocumentText(FILE *pFile, const pps_info_type *pPPS,
00178         const ULONG *aulBBD, size_t tBBDLen,
00179         const ULONG *aulSBD, size_t tSBDLen,
00180         const UCHAR *aucHeader)
00181 {
00182         const ULONG     *aulBlockDepot;
00183         UCHAR   *aucBuffer;
00184         ULONG   ulTextOffset, ulBeginTextInfo;
00185         ULONG   ulTotLength, ulLen;
00186         long    lIndex, lPieces, lOff;
00187         size_t  tTextInfoLen, tBlockDepotLen, tBlockSize;
00188         int     iType, iLen;
00189         BOOL    bUsesUnicode;
00190         USHORT  usPropMod;
00191 
00192         DBG_MSG("bGet8DocumentText");
00193 
00194         fail(pFile == NULL || pPPS == NULL);
00195         fail(aulBBD == NULL || aulSBD == NULL);
00196         fail(aucHeader == NULL);
00197 
00198         ulBeginTextInfo = ulGetLong(0x1a2, aucHeader);  /* fcClx */
00199         DBG_HEX(ulBeginTextInfo);
00200         tTextInfoLen = (size_t)ulGetLong(0x1a6, aucHeader);     /* lcbClx */
00201         DBG_DEC(tTextInfoLen);
00202 
00203         DBG_DEC(pPPS->tTable.ulSB);
00204         DBG_HEX(pPPS->tTable.ulSize);
00205         if (pPPS->tTable.ulSize == 0) {
00206                 return FALSE;
00207         }
00208 
00209         if (pPPS->tTable.ulSize < MIN_SIZE_FOR_BBD_USE) {
00210                 /* Use the Small Block Depot */
00211                 aulBlockDepot = aulSBD;
00212                 tBlockDepotLen = tSBDLen;
00213                 tBlockSize = SMALL_BLOCK_SIZE;
00214         } else {
00215                 /* Use the Big Block Depot */
00216                 aulBlockDepot = aulBBD;
00217                 tBlockDepotLen = tBBDLen;
00218                 tBlockSize = BIG_BLOCK_SIZE;
00219         }
00220         aucBuffer = xmalloc(tTextInfoLen);
00221         if (!bReadBuffer(pFile, pPPS->tTable.ulSB,
00222                         aulBlockDepot, tBlockDepotLen, tBlockSize,
00223                         aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
00224                 aucBuffer = xfree(aucBuffer);
00225                 return FALSE;
00226         }
00227         NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
00228 
00229         lOff = 0;
00230         while (lOff < (long)tTextInfoLen) {
00231                 iType = (int)ucGetByte(lOff, aucBuffer);
00232                 lOff++;
00233                 if (iType == 0) {
00234                         DBG_FIXME();
00235                         lOff++;
00236                         continue;
00237                 }
00238                 if (iType == 1) {
00239                         iLen = (int)usGetWord(lOff, aucBuffer);
00240                         vAdd2PropModList(aucBuffer + lOff);
00241                         lOff += (long)iLen + 2;
00242                         continue;
00243                 }
00244                 if (iType != 2) {
00245                         werr(0, "Unknown type of 'fastsaved' format");
00246                         aucBuffer = xfree(aucBuffer);
00247                         return FALSE;
00248                 }
00249                 /* Type 2 */
00250                 ulLen = ulGetLong(lOff, aucBuffer);
00251                 if (ulLen < 4) {
00252                         DBG_DEC(ulLen);
00253                         return FALSE;
00254                 }
00255                 lOff += 4;
00256                 lPieces = (long)((ulLen - 4) / 12);
00257                 DBG_DEC(lPieces);
00258                 for (lIndex = 0; lIndex < lPieces; lIndex++) {
00259                         ulTextOffset = ulGetLong(
00260                                 lOff + (lPieces + 1) * 4 + lIndex * 8 + 2,
00261                                 aucBuffer);
00262                         usPropMod = usGetWord(
00263                                 lOff + (lPieces + 1) * 4 + lIndex * 8 + 6,
00264                                 aucBuffer);
00265                         ulTotLength = ulGetLong(lOff + (lIndex + 1) * 4,
00266                                                 aucBuffer) -
00267                                         ulGetLong(lOff + lIndex * 4,
00268                                                 aucBuffer);
00269                         if ((ulTextOffset & BIT(30)) == 0) {
00270                                 bUsesUnicode = TRUE;
00271                         } else {
00272                                 bUsesUnicode = FALSE;
00273                                 ulTextOffset &= ~BIT(30);
00274                                 ulTextOffset /= 2;
00275                         }
00276                         NO_DBG_HEX_C(usPropMod != 0, usPropMod);
00277                         if (!bAddTextBlocks(ulTextOffset, ulTotLength,
00278                                         bUsesUnicode, usPropMod,
00279                                         pPPS->tWordDocument.ulSB,
00280                                         aulBBD, tBBDLen)) {
00281                                 aucBuffer = xfree(aucBuffer);
00282                                 return FALSE;
00283                         }
00284                 }
00285                 break;
00286         }
00287         aucBuffer = xfree(aucBuffer);
00288         return TRUE;
00289 } /* end of bGet8DocumentText */

Generated by  doxygen 1.6.2