examples/PIPS/antiword/src/chartrans.c

00001 /*
00002  * chartrans.c
00003  * Copyright (C) 1999-2004 A.J. van Os; Released under GNU GPL
00004  *
00005  * Description:
00006  * Translate Word characters to local representation
00007  */
00008 
00009 #include <stdlib.h>
00010 #include <string.h>
00011 #include <ctype.h>
00012 #if defined(__STDC_ISO_10646__)
00013 #include <wctype.h>
00014 #endif /* __STDC_ISO_10646__ */
00015 #include "antiword.h"
00016 
00017 static const USHORT usCp850[] = {       /* DOS implementation of Latin1 */
00018         0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
00019         0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
00020         0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
00021         0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192,
00022         0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
00023         0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
00024         0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x00c0,
00025         0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, 0x2510,
00026         0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3,
00027         0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
00028         0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce,
00029         0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
00030         0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
00031         0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
00032         0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
00033         0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0,
00034 };
00035 
00036 static const USHORT usCp1250[] = {      /* Windows implementation of Latin2 */
00037         0x20ac, 0x003f, 0x201a, 0x003f, 0x201e, 0x2026, 0x2020, 0x2021,
00038         0x003f, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
00039         0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
00040         0x003f, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
00041         0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,
00042         0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
00043         0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
00044         0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
00045         0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
00046         0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
00047         0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
00048         0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
00049         0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
00050         0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
00051         0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
00052         0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
00053 };
00054 
00055 static const USHORT usCp1251[] = {      /* Windows implementation of Cyrillic */
00056         0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
00057         0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
00058         0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
00059         0x00f3, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
00060         0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
00061         0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
00062         0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
00063         0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
00064         0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
00065         0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
00066         0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
00067         0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
00068         0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
00069         0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
00070         0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
00071         0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
00072 };
00073 
00074 static const USHORT usCp1252[] = {      /* Windows implementation of Latin1 */
00075         0x20ac, 0x003f, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
00076         0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x003f, 0x017d, 0x003f,
00077         0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
00078         0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x003f, 0x017e, 0x0178,
00079         0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
00080         0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
00081         0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
00082         0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
00083         0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
00084         0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
00085         0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
00086         0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
00087         0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
00088         0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
00089         0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
00090         0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
00091 };
00092 
00093 static const USHORT usMacRoman[] = {    /* Apple implementation of Latin1 */
00094         0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,
00095         0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,
00096         0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,
00097         0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,
00098         0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,
00099         0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,
00100         0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,
00101         0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x2126, 0x00e6, 0x00f8,
00102         0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,
00103         0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,
00104         0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,
00105         0x00ff, 0x0178, 0x2044, 0x00a4, 0x2039, 0x203a, 0xfb01, 0xfb02,
00106         0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,
00107         0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,
00108         0x003f, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,
00109         0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
00110 };
00111 
00112 static const USHORT usPrivateArea[] = {
00113         0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220d,
00114         0x0028, 0x0029, 0x2217, 0x002b, 0x002c, 0x2212, 0x002e, 0x002f,
00115         0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
00116         0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x2019, 0x003e, 0x003f,
00117         0x201d, 0x201c, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393,
00118         0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f,
00119         0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9,
00120         0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0x005f,
00121         0x003f, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03c6, 0x03b3,
00122         0x03b7, 0x03b9, 0x03d5, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf,
00123         0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9,
00124         0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x223c, 0x003f,
00125         0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00126         0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00127         0x003f, 0x003f, 0x003f, 0x2022, 0x003f, 0x003f, 0x003f, 0x003f,
00128         0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
00129         0x20ac, 0x03d2, 0x2032, 0x2264, 0x2044, 0x221e, 0x0192, 0x2663,
00130         0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
00131         0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022,
00132         0x00f7, 0x2260, 0x2261, 0x2248, 0x2026, 0x007c, 0x23af, 0x21b5,
00133         0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
00134         0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
00135         0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5,
00136         0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3,
00137         0x22c4, 0x3008, 0x00ae, 0x00a9, 0x2122, 0x2211, 0x239b, 0x239c,
00138         0x239d, 0x23a1, 0x23a2, 0x23a3, 0x23a7, 0x23a8, 0x23a9, 0x23aa,
00139         0x003f, 0x3009, 0x222b, 0x2320, 0x23ae, 0x2321, 0x239e, 0x239f,
00140         0x23a0, 0x23a4, 0x23a5, 0x23a6, 0x23ab, 0x23ac, 0x23ad, 0x003f,
00141 };
00142 
00143 typedef struct char_table_tag {
00144         UCHAR   ucLocal;
00145         USHORT  usUnicode;
00146 } char_table_type;
00147 
00148 static char_table_type  atCharTable[256];
00149 static size_t           tNextPosFree = 0;
00150 
00151 
00152 /*
00153  * iCompare - compare two records
00154  *
00155  * Compares two records. For use by qsort(3C) and bsearch(3C).
00156  *
00157  * returns -1 if rec1 < rec2, 0 if rec1 == rec2, 1 if rec1 > rec2
00158  */
00159 static int
00160 iCompare(const void *pvRecord1, const void *pvRecord2)
00161 {
00162         USHORT  usUnicode1, usUnicode2;
00163 
00164         usUnicode1 = ((char_table_type *)pvRecord1)->usUnicode;
00165         usUnicode2 = ((char_table_type *)pvRecord2)->usUnicode;
00166 
00167         if (usUnicode1 < usUnicode2) {
00168                 return -1;
00169         }
00170         if (usUnicode1 > usUnicode2) {
00171                 return 1;
00172         }
00173         return 0;
00174 } /* end of iCompare */
00175 
00176 /*
00177  * pGetCharTableRecord - get the character table record
00178  *
00179  * returns a pointer to the record when found, otherwise NULL
00180  */
00181 static const char_table_type *
00182 pGetCharTableRecord(USHORT usUnicode)
00183 {
00184         char_table_type tKey;
00185 
00186         if (tNextPosFree == 0) {
00187                 return NULL;
00188         }
00189         tKey.usUnicode = usUnicode;
00190         tKey.ucLocal = 0;
00191         return (char_table_type *)bsearch(&tKey,
00192                         atCharTable,
00193                         tNextPosFree, sizeof(atCharTable[0]),
00194                         iCompare);
00195 } /* end of pGetCharTableRecord */
00196 
00197 /*
00198  * ucGetBulletCharacter - get the local representation of the bullet
00199  */
00200 UCHAR
00201 ucGetBulletCharacter(conversion_type eConversionType, encoding_type eEncoding)
00202 {
00203 #if defined(__riscos)
00204         return 0x8f;
00205 #else
00206         const char_table_type   *pRec;
00207 
00208         fail(eEncoding == encoding_utf_8);
00209 
00210         if (eEncoding == encoding_latin_1 &&
00211             (eConversionType == conversion_ps ||
00212              eConversionType == conversion_pdf)) {
00213                 /* Ugly, but it makes the PostScript and PDF look better */
00214                 return (UCHAR)143;
00215         }
00216         if (eConversionType != conversion_text &&
00217             eConversionType != conversion_fmt_text) {
00218                 pRec = pGetCharTableRecord(UNICODE_BULLET);
00219                 if (pRec != NULL) {
00220                         return pRec->ucLocal;
00221                 }
00222                 pRec = pGetCharTableRecord(UNICODE_BULLET_OPERATOR);
00223                 if (pRec != NULL) {
00224                         return pRec->ucLocal;
00225                 }
00226                 pRec = pGetCharTableRecord(UNICODE_MIDDLE_DOT);
00227                 if (pRec != NULL) {
00228                         return pRec->ucLocal;
00229                 }
00230         }
00231         return (UCHAR)'.';
00232 #endif /* __riscos */
00233 } /* end of ucGetBulletCharacter */
00234 
00235 /*
00236  * ucGetNbspCharacter - get the local representation of the non-breaking space
00237  */
00238 UCHAR
00239 ucGetNbspCharacter(void)
00240 {
00241         const char_table_type   *pRec;
00242 
00243         pRec = pGetCharTableRecord(0x00a0);     /* Unicode non-breaking space */
00244         if (pRec == NULL) {
00245                 DBG_MSG("Non-breaking space record not found");
00246                 /* No value found, use the best guess */
00247                 return (UCHAR)0xa0;
00248         }
00249         return pRec->ucLocal;
00250 } /* end of ucGetNbspCharacter */
00251 
00252 /*
00253  * bReadCharacterMappingTable - read the mapping table
00254  *
00255  * Read the character mapping table from file and have the contents sorted
00256  *
00257  * returns TRUE if successful, otherwise FALSE
00258  */
00259 BOOL
00260 bReadCharacterMappingTable(FILE *pFile)
00261 {
00262         char    *pcTmp;
00263         ULONG   ulUnicode;
00264         UINT    uiLocal;
00265         int     iFields;
00266         char    szLine[81];
00267 
00268         if (pFile == NULL) {
00269                 return FALSE;
00270         }
00271 
00272         /* Clean the table first */
00273         (void)memset(atCharTable, 0, sizeof(atCharTable));
00274 
00275         /* Fill the table */
00276         while (fgets(szLine, (int)sizeof(szLine), pFile)) {
00277                 if (szLine[0] == '#' ||
00278                     szLine[0] == '\r' ||
00279                     szLine[0] == '\n') {
00280                         /* Comment or empty line */
00281                         continue;
00282                 }
00283                 iFields = sscanf(szLine, "%x %lx %*s", &uiLocal, &ulUnicode);
00284                 if (iFields != 2) {
00285                         pcTmp = strchr(szLine, '\r');
00286                         if (pcTmp != NULL) {
00287                                 *pcTmp = '\0';
00288                         }
00289                         pcTmp = strchr(szLine, '\n');
00290                         if (pcTmp != NULL) {
00291                                 *pcTmp = '\0';
00292                         }
00293                         werr(0, "Syntax error in: '%s'", szLine);
00294                         continue;
00295                 }
00296                 if (uiLocal > 0xff || ulUnicode > 0xffff) {
00297                         werr(0, "Syntax error in: '%02x %04lx'",
00298                                         uiLocal, ulUnicode);
00299                         continue;
00300                 }
00301                 /* Store only the relevant entries */
00302                 if (uiLocal != ulUnicode || uiLocal >= 0x80) {
00303                         atCharTable[tNextPosFree].ucLocal = (UCHAR)uiLocal;
00304                         atCharTable[tNextPosFree].usUnicode = (USHORT)ulUnicode;
00305                         tNextPosFree++;
00306                 }
00307                 if (tNextPosFree >= elementsof(atCharTable)) {
00308                         werr(0, "Too many entries in the character mapping "
00309                                 "file. Ignoring the rest.");
00310                         break;
00311                 }
00312         }
00313 
00314         if (tNextPosFree != 0) {
00315                 DBG_HEX(atCharTable[0].usUnicode);
00316                 DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
00317 
00318                 qsort(atCharTable,
00319                         tNextPosFree, sizeof(atCharTable[0]),
00320                         iCompare);
00321 
00322                 DBG_HEX(atCharTable[0].usUnicode);
00323                 DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
00324         }
00325 
00326         return TRUE;
00327 } /* end of bReadCharacterMappingTable */
00328 
00329 /*
00330  * ulTranslateCharacters - Translate characters to local representation
00331  *
00332  * Translate all characters to local representation
00333  *
00334  * returns the translated character
00335  */
00336 ULONG
00337 ulTranslateCharacters(USHORT usChar, ULONG ulFileOffset, int iWordVersion,
00338         conversion_type eConversionType, encoding_type eEncoding,
00339         BOOL bUseMacCharSet)
00340 {
00341         const char_table_type   *pTmp;
00342         const USHORT    *usCharSet;
00343 
00344         usCharSet = NULL;
00345         if (bUseMacCharSet) {
00346                 /* Macintosh character set */
00347                 usCharSet = usMacRoman;
00348         } else if (iWordVersion == 0) {
00349                 /* DOS character set */
00350                 usCharSet = usCp850;
00351         } else {
00352                 /* Windows character set */
00353                 switch (eEncoding) {
00354                 case encoding_latin_2:
00355                         usCharSet = usCp1250;
00356                         break;
00357                 case encoding_cyrillic:
00358                         usCharSet = usCp1251;
00359                         break;
00360                 case encoding_latin_1:
00361                 default:
00362                         usCharSet = usCp1252;
00363                         break;
00364                 }
00365         }
00366         fail(usCharSet == NULL);
00367         if (usChar >= 0x80 && usChar <= 0x9f) {
00368                 /* Translate implementation defined characters */
00369                 usChar = usCharSet[usChar - 0x80];
00370         } else if (iWordVersion < 8 && usChar >= 0xa0 && usChar <= 0xff) {
00371                 /* Translate old character set to Unixcode */
00372                 usChar = usCharSet[usChar - 0x80];
00373         }
00374 
00375         /* Microsoft Unicode to real Unicode */
00376         if (usChar >= 0xf020 && usChar <= 0xf0ff) {
00377                 DBG_HEX_C(usPrivateArea[usChar - 0xf020] == 0x003f, usChar);
00378                 usChar = usPrivateArea[usChar - 0xf020];
00379         }
00380 
00381         /* Characters with a special meaning in Word */
00382         switch (usChar) {
00383         case IGNORE_CHARACTER:
00384         case FOOTNOTE_SEPARATOR:
00385         case FOOTNOTE_CONTINUATION:
00386         case ANNOTATION:
00387         case FRAME:
00388         case LINE_FEED:
00389         case WORD_SOFT_HYPHEN:
00390         case UNICODE_HYPHENATION_POINT:
00391                 return IGNORE_CHARACTER;
00392         case PICTURE:
00393         case TABLE_SEPARATOR:
00394         case TAB:
00395         case HARD_RETURN:
00396         case PAGE_BREAK:
00397         case PAR_END:
00398         case COLUMN_FEED:
00399                 return (ULONG)usChar;
00400         case FOOTNOTE_OR_ENDNOTE:
00401                 NO_DBG_HEX(ulFileOffset);
00402                 switch (eGetNotetype(ulFileOffset)) {
00403                 case notetype_is_footnote:
00404                         return FOOTNOTE_CHAR;
00405                 case notetype_is_endnote:
00406                         return ENDNOTE_CHAR;
00407                 default:
00408                         return UNKNOWN_NOTE_CHAR;
00409                 }
00410         case WORD_UNBREAKABLE_JOIN:
00411                 return (ULONG)OUR_UNBREAKABLE_JOIN;
00412         default:
00413                 break;
00414         }
00415 
00416         if (eEncoding != encoding_utf_8) {
00417                 /* Latin characters in an oriental text */
00418                 if (usChar >= 0xff01 && usChar <= 0xff5e) {
00419                         usChar -= 0xfee0;
00420                 }
00421         }
00422 
00423         if (eEncoding == encoding_latin_1 &&
00424             (eConversionType == conversion_ps ||
00425              eConversionType == conversion_pdf)) {
00426                 /* Ugly, but it makes the PostScript and PDF look better */
00427                 switch (usChar) {
00428                 case UNICODE_ELLIPSIS:
00429                         return 140;
00430                 case UNICODE_TRADEMARK_SIGN:
00431                         return 141;
00432                 case UNICODE_PER_MILLE_SIGN:
00433                         return 142;
00434                 case UNICODE_BULLET:
00435                 case UNICODE_BULLET_OPERATOR:
00436                 case UNICODE_BLACK_CLUB_SUIT:
00437                         return 143;
00438                 case UNICODE_LEFT_SINGLE_QMARK:
00439                         return 144;
00440                 case UNICODE_RIGHT_SINGLE_QMARK:
00441                         return 145;
00442                 case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
00443                         return 146;
00444                 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
00445                         return 147;
00446                 case UNICODE_LEFT_DOUBLE_QMARK:
00447                         return 148;
00448                 case UNICODE_RIGHT_DOUBLE_QMARK:
00449                         return 149;
00450                 case UNICODE_DOUBLE_LOW_9_QMARK:
00451                         return 150;
00452                 case UNICODE_EN_DASH:
00453                         return 151;
00454                 case UNICODE_EM_DASH:
00455                         return 152;
00456                 case UNICODE_MINUS_SIGN:
00457                         return 153;
00458                 case UNICODE_CAPITAL_LIGATURE_OE:
00459                         return 154;
00460                 case UNICODE_SMALL_LIGATURE_OE:
00461                         return 155;
00462                 case UNICODE_DAGGER:
00463                         return 156;
00464                 case UNICODE_DOUBLE_DAGGER:
00465                         return 157;
00466                 case UNICODE_SMALL_LIGATURE_FI:
00467                         return 158;
00468                 case UNICODE_SMALL_LIGATURE_FL:
00469                         return 159;
00470                 default:
00471                         break;
00472                 }
00473         }
00474 
00475         if (eConversionType == conversion_pdf) {
00476                 if (eEncoding == encoding_latin_1) {
00477                         switch (usChar) {
00478                         case UNICODE_EURO_SIGN:
00479                                 return 128;
00480                         default:
00481                                 break;
00482                         }
00483                 } else if (eEncoding == encoding_latin_2) {
00484                         switch (usChar) {
00485                         case UNICODE_CAPITAL_D_WITH_STROKE:
00486                         case UNICODE_SMALL_D_WITH_STROKE:
00487                                 return 0x3f;
00488                         default:
00489                                 break;
00490                         }
00491                 }
00492         }
00493 
00494         if (usChar < 0x80) {
00495                 /* US ASCII */
00496                 if (usChar < 0x20 || usChar == 0x7f) {
00497                         /* Ignore control characters */
00498                         DBG_HEX(usChar);
00499                         DBG_FIXME();
00500                         return IGNORE_CHARACTER;
00501                 }
00502                 return (ULONG)usChar;
00503         }
00504 
00505         if (eEncoding == encoding_utf_8) {
00506                 /* No need to convert Unicode characters */
00507                 return (ULONG)usChar;
00508         }
00509 
00510         /* Unicode to local representation */
00511         pTmp = pGetCharTableRecord(usChar);
00512         if (pTmp != NULL) {
00513                 DBG_HEX_C(usChar >= 0x7f && usChar <= 0x9f, usChar);
00514                 return (ULONG)pTmp->ucLocal;
00515         }
00516 
00517         /* Fancy characters to simple US ASCII */
00518         switch (usChar) {
00519         case UNICODE_SMALL_F_HOOK:
00520                 return (ULONG)'f';
00521         case UNICODE_GREEK_CAPITAL_CHI:
00522                 return (ULONG)'X';
00523         case UNICODE_GREEK_SMALL_UPSILON:
00524                 return (ULONG)'v';
00525         case UNICODE_MODIFIER_CIRCUMFLEX:
00526         case UNICODE_UPWARDS_ARROW:
00527                 return (ULONG)'^';
00528         case UNICODE_SMALL_TILDE:
00529         case UNICODE_TILDE_OPERATOR:
00530                 return (ULONG)'~';
00531         case UNICODE_EN_QUAD:
00532         case UNICODE_EM_QUAD:
00533         case UNICODE_EN_SPACE:
00534         case UNICODE_EM_SPACE:
00535         case UNICODE_THREE_PER_EM_SPACE:
00536         case UNICODE_FOUR_PER_EM_SPACE:
00537         case UNICODE_SIX_PER_EM_SPACE:
00538         case UNICODE_FIGURE_SPACE:
00539         case UNICODE_PUNCTUATION_SPACE:
00540         case UNICODE_THIN_SPACE:
00541         case UNICODE_NARROW_NO_BREAK_SPACE:
00542         case UNICODE_LIGHT_SHADE:
00543         case UNICODE_MEDIUM_SHADE:
00544         case UNICODE_DARK_SHADE:
00545                 return (ULONG)' ';
00546         case UNICODE_LEFT_DOUBLE_QMARK:
00547         case UNICODE_RIGHT_DOUBLE_QMARK:
00548         case UNICODE_DOUBLE_LOW_9_QMARK:
00549         case UNICODE_DOUBLE_HIGH_REV_9_QMARK:
00550         case UNICODE_DOUBLE_PRIME:
00551                 return (ULONG)'"';
00552         case UNICODE_LEFT_SINGLE_QMARK:
00553         case UNICODE_RIGHT_SINGLE_QMARK:
00554         case UNICODE_SINGLE_LOW_9_QMARK:
00555         case UNICODE_SINGLE_HIGH_REV_9_QMARK:
00556         case UNICODE_PRIME:
00557                 return (ULONG)'\'';
00558         case UNICODE_HYPHEN:
00559         case UNICODE_NON_BREAKING_HYPHEN:
00560         case UNICODE_FIGURE_DASH:
00561         case UNICODE_EN_DASH:
00562         case UNICODE_EM_DASH:
00563         case UNICODE_HORIZONTAL_BAR:
00564         case UNICODE_MINUS_SIGN:
00565         case UNICODE_BD_LIGHT_HORIZONTAL:
00566         case UNICODE_BD_DOUBLE_HORIZONTAL:
00567                 return (ULONG)'-';
00568         case UNICODE_DOUBLE_VERTICAL_LINE:
00569         case UNICODE_BD_LIGHT_VERTICAL:
00570         case UNICODE_BD_DOUBLE_VERTICAL:
00571                 return (ULONG)'|';
00572         case UNICODE_DOUBLE_LOW_LINE:
00573                 return (ULONG)'_';
00574         case UNICODE_DAGGER:
00575                 return (ULONG)'+';
00576         case UNICODE_DOUBLE_DAGGER:
00577                 return (ULONG)'#';
00578         case UNICODE_BULLET:
00579         case UNICODE_BULLET_OPERATOR:
00580         case UNICODE_BLACK_CLUB_SUIT:
00581                 return (ULONG)ucGetBulletCharacter(eConversionType, eEncoding);
00582         case UNICODE_ONE_DOT_LEADER:
00583         case UNICODE_TWO_DOT_LEADER:
00584                 return (ULONG)'.';
00585         case UNICODE_ELLIPSIS:
00586 #if defined(__riscos)
00587                 return (ULONG)OUR_ELLIPSIS;
00588 #else
00589                 if (ulFileOffset == 0) {
00590                         return (ULONG)OUR_ELLIPSIS;
00591                 }
00592                 return UNICODE_ELLIPSIS;
00593 #endif /* __riscos */
00594         case UNICODE_DOUBLE_LEFT_ANGLE_QMARK:
00595         case UNICODE_TRIANGULAR_BULLET:
00596         case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
00597         case UNICODE_LEFTWARDS_ARROW:
00598                 return (ULONG)'<';
00599         case UNICODE_DOUBLE_RIGHT_ANGLE_QMARK:
00600         case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
00601         case UNICODE_RIGHTWARDS_ARROW:
00602                 return (ULONG)'>';
00603         case UNICODE_UNDERTIE:
00604                 return (ULONG)'-';
00605         case UNICODE_N_ARY_SUMMATION:
00606                 return (ULONG)'S';
00607         case UNICODE_EURO_SIGN:
00608                 return (ULONG)'E';
00609         case UNICODE_CIRCLE:
00610         case UNICODE_SQUARE:
00611                 return (ULONG)'O';
00612         case UNICODE_DIAMOND:
00613                 return (ULONG)OUR_DIAMOND;
00614         case UNICODE_NUMERO_SIGN:
00615                 return (ULONG)'N';
00616         case UNICODE_KELVIN_SIGN:
00617                 return (ULONG)'K';
00618         case UNICODE_DOWNWARDS_ARROW:
00619                 return (ULONG)'v';
00620         case UNICODE_FRACTION_SLASH:
00621         case UNICODE_DIVISION_SLASH:
00622                 return (ULONG)'/';
00623         case UNICODE_ASTERISK_OPERATOR:
00624                 return (ULONG)'*';
00625         case UNICODE_RATIO:
00626                 return (ULONG)':';
00627         case UNICODE_BD_LIGHT_DOWN_RIGHT:
00628         case UNICODE_BD_LIGHT_DOWN_AND_LEFT:
00629         case UNICODE_BD_LIGHT_UP_AND_RIGHT:
00630         case UNICODE_BD_LIGHT_UP_AND_LEFT:
00631         case UNICODE_BD_LIGHT_VERTICAL_AND_RIGHT:
00632         case UNICODE_BD_LIGHT_VERTICAL_AND_LEFT:
00633         case UNICODE_BD_LIGHT_DOWN_AND_HORIZONTAL:
00634         case UNICODE_BD_LIGHT_UP_AND_HORIZONTAL:
00635         case UNICODE_BD_LIGHT_VERTICAL_AND_HORIZONTAL:
00636         case UNICODE_BD_DOUBLE_DOWN_AND_RIGHT:
00637         case UNICODE_BD_DOUBLE_DOWN_AND_LEFT:
00638         case UNICODE_BD_DOUBLE_UP_AND_RIGHT:
00639         case UNICODE_BD_DOUBLE_UP_AND_LEFT:
00640         case UNICODE_BD_DOUBLE_VERTICAL_AND_RIGHT:
00641         case UNICODE_BD_DOUBLE_VERTICAL_AND_LEFT:
00642         case UNICODE_BD_DOUBLE_DOWN_AND_HORIZONTAL:
00643         case UNICODE_BD_DOUBLE_UP_AND_HORIZONTAL:
00644         case UNICODE_BD_DOUBLE_VERTICAL_AND_HORIZONTAL:
00645         case UNICODE_BLACK_SQUARE:
00646                 return (ULONG)'+';
00647         case UNICODE_HAIR_SPACE:
00648         case UNICODE_ZERO_WIDTH_SPACE:
00649         case UNICODE_ZERO_WIDTH_NON_JOINER:
00650         case UNICODE_ZERO_WIDTH_JOINER:
00651         case UNICODE_LEFT_TO_RIGHT_MARK:
00652         case UNICODE_RIGHT_TO_LEFT_MARK:
00653         case UNICODE_LEFT_TO_RIGHT_EMBEDDING:
00654         case UNICODE_RIGHT_TO_LEFT_EMBEDDING:
00655         case UNICODE_POP_DIRECTIONAL_FORMATTING:
00656         case UNICODE_LEFT_TO_RIGHT_OVERRIDE:
00657         case UNICODE_RIGHT_TO_LEFT_OVERRIDE:
00658         case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE:
00659                 return IGNORE_CHARACTER;
00660         default:
00661                 break;
00662         }
00663 
00664         if (usChar == UNICODE_TRADEMARK_SIGN) {
00665                 /*
00666                  * No local representation, it doesn't look like anything in
00667                  * US-ASCII and a question mark does more harm than good.
00668                  */
00669                 return IGNORE_CHARACTER;
00670         }
00671 
00672         if (usChar >= 0xa0 && usChar <= 0xff) {
00673                 /* Before Word 97, Word did't use Unicode */
00674                 return (ULONG)usChar;
00675         }
00676 
00677         DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, ulFileOffset);
00678         DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, usChar);
00679         DBG_MSG_C(usChar >= 0xe000 && usChar < 0xf900, "Private Use Area");
00680 
00681         /* Untranslated Unicode character */
00682         return 0x3f;
00683 } /* end of ulTranslateCharacters */
00684 
00685 /*
00686  * ulToUpper - convert letter to upper case
00687  *
00688  * This function converts a letter to upper case. Unlike toupper(3) this
00689  * function is independent from the settings of locale. This comes in handy
00690  * for people who have to read Word documents in more than one language or
00691  * contain more than one language.
00692  *
00693  * returns the converted letter, or ulChar if the conversion was not possible.
00694  */
00695 ULONG
00696 ulToUpper(ULONG ulChar)
00697 {
00698         if (ulChar < 0x80) {
00699                 /* US ASCII: use standard function */
00700                 return (ULONG)toupper((int)ulChar);
00701         }
00702         if (ulChar >= 0xe0 && ulChar <= 0xfe && ulChar != 0xf7) {
00703                 /*
00704                  * Lower case accented characters
00705                  * 0xf7 is Division sign; 0xd7 is Multiplication sign
00706                  * 0xff is y with diaeresis; 0xdf is Sharp s
00707                  */
00708                 return ulChar & ~0x20;
00709         }
00710 #if defined(__STDC_ISO_10646__)
00711         /*
00712          * If this is ISO C99 and all locales have wchar_t = ISO 10646
00713          * (e.g., glibc 2.2 or newer), then use standard function
00714          */
00715         if (ulChar > 0xff) {
00716                 return (ULONG)towupper((wint_t)ulChar);
00717         }
00718 #endif /* __STDC_ISO_10646__ */
00719         return ulChar;
00720 } /* end of ulToUpper */

Generated by  doxygen 1.6.2