Header And Logo

PostgreSQL
| The world's most advanced open source database.

ts_locale.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * ts_locale.c
00004  *      locale compatibility layer for tsearch
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/tsearch/ts_locale.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 #include "postgres.h"
00015 
00016 #include "catalog/pg_collation.h"
00017 #include "storage/fd.h"
00018 #include "tsearch/ts_locale.h"
00019 #include "tsearch/ts_public.h"
00020 
00021 static void tsearch_readline_callback(void *arg);
00022 
00023 
00024 #ifdef USE_WIDE_UPPER_LOWER
00025 
00026 int
00027 t_isdigit(const char *ptr)
00028 {
00029     int         clen = pg_mblen(ptr);
00030     wchar_t     character[2];
00031     Oid         collation = DEFAULT_COLLATION_OID;      /* TODO */
00032     pg_locale_t mylocale = 0;   /* TODO */
00033 
00034     if (clen == 1 || lc_ctype_is_c(collation))
00035         return isdigit(TOUCHAR(ptr));
00036 
00037     char2wchar(character, 2, ptr, clen, mylocale);
00038 
00039     return iswdigit((wint_t) character[0]);
00040 }
00041 
00042 int
00043 t_isspace(const char *ptr)
00044 {
00045     int         clen = pg_mblen(ptr);
00046     wchar_t     character[2];
00047     Oid         collation = DEFAULT_COLLATION_OID;      /* TODO */
00048     pg_locale_t mylocale = 0;   /* TODO */
00049 
00050     if (clen == 1 || lc_ctype_is_c(collation))
00051         return isspace(TOUCHAR(ptr));
00052 
00053     char2wchar(character, 2, ptr, clen, mylocale);
00054 
00055     return iswspace((wint_t) character[0]);
00056 }
00057 
00058 int
00059 t_isalpha(const char *ptr)
00060 {
00061     int         clen = pg_mblen(ptr);
00062     wchar_t     character[2];
00063     Oid         collation = DEFAULT_COLLATION_OID;      /* TODO */
00064     pg_locale_t mylocale = 0;   /* TODO */
00065 
00066     if (clen == 1 || lc_ctype_is_c(collation))
00067         return isalpha(TOUCHAR(ptr));
00068 
00069     char2wchar(character, 2, ptr, clen, mylocale);
00070 
00071     return iswalpha((wint_t) character[0]);
00072 }
00073 
00074 int
00075 t_isprint(const char *ptr)
00076 {
00077     int         clen = pg_mblen(ptr);
00078     wchar_t     character[2];
00079     Oid         collation = DEFAULT_COLLATION_OID;      /* TODO */
00080     pg_locale_t mylocale = 0;   /* TODO */
00081 
00082     if (clen == 1 || lc_ctype_is_c(collation))
00083         return isprint(TOUCHAR(ptr));
00084 
00085     char2wchar(character, 2, ptr, clen, mylocale);
00086 
00087     return iswprint((wint_t) character[0]);
00088 }
00089 #endif   /* USE_WIDE_UPPER_LOWER */
00090 
00091 
00092 /*
00093  * Set up to read a file using tsearch_readline().  This facility is
00094  * better than just reading the file directly because it provides error
00095  * context pointing to the specific line where a problem is detected.
00096  *
00097  * Expected usage is:
00098  *
00099  *      tsearch_readline_state trst;
00100  *
00101  *      if (!tsearch_readline_begin(&trst, filename))
00102  *          ereport(ERROR,
00103  *                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
00104  *                   errmsg("could not open stop-word file \"%s\": %m",
00105  *                          filename)));
00106  *      while ((line = tsearch_readline(&trst)) != NULL)
00107  *          process line;
00108  *      tsearch_readline_end(&trst);
00109  *
00110  * Note that the caller supplies the ereport() for file open failure;
00111  * this is so that a custom message can be provided.  The filename string
00112  * passed to tsearch_readline_begin() must remain valid through
00113  * tsearch_readline_end().
00114  */
00115 bool
00116 tsearch_readline_begin(tsearch_readline_state *stp,
00117                        const char *filename)
00118 {
00119     if ((stp->fp = AllocateFile(filename, "r")) == NULL)
00120         return false;
00121     stp->filename = filename;
00122     stp->lineno = 0;
00123     stp->curline = NULL;
00124     /* Setup error traceback support for ereport() */
00125     stp->cb.callback = tsearch_readline_callback;
00126     stp->cb.arg = (void *) stp;
00127     stp->cb.previous = error_context_stack;
00128     error_context_stack = &stp->cb;
00129     return true;
00130 }
00131 
00132 /*
00133  * Read the next line from a tsearch data file (expected to be in UTF-8), and
00134  * convert it to database encoding if needed. The returned string is palloc'd.
00135  * NULL return means EOF.
00136  */
00137 char *
00138 tsearch_readline(tsearch_readline_state *stp)
00139 {
00140     char       *result;
00141 
00142     stp->lineno++;
00143     stp->curline = NULL;
00144     result = t_readline(stp->fp);
00145     stp->curline = result;
00146     return result;
00147 }
00148 
00149 /*
00150  * Close down after reading a file with tsearch_readline()
00151  */
00152 void
00153 tsearch_readline_end(tsearch_readline_state *stp)
00154 {
00155     FreeFile(stp->fp);
00156     /* Pop the error context stack */
00157     error_context_stack = stp->cb.previous;
00158 }
00159 
00160 /*
00161  * Error context callback for errors occurring while reading a tsearch
00162  * configuration file.
00163  */
00164 static void
00165 tsearch_readline_callback(void *arg)
00166 {
00167     tsearch_readline_state *stp = (tsearch_readline_state *) arg;
00168 
00169     /*
00170      * We can't include the text of the config line for errors that occur
00171      * during t_readline() itself.  This is only partly a consequence of our
00172      * arms-length use of that routine: the major cause of such errors is
00173      * encoding violations, and we daren't try to print error messages
00174      * containing badly-encoded data.
00175      */
00176     if (stp->curline)
00177         errcontext("line %d of configuration file \"%s\": \"%s\"",
00178                    stp->lineno,
00179                    stp->filename,
00180                    stp->curline);
00181     else
00182         errcontext("line %d of configuration file \"%s\"",
00183                    stp->lineno,
00184                    stp->filename);
00185 }
00186 
00187 
00188 /*
00189  * Read the next line from a tsearch data file (expected to be in UTF-8), and
00190  * convert it to database encoding if needed. The returned string is palloc'd.
00191  * NULL return means EOF.
00192  *
00193  * Note: direct use of this function is now deprecated.  Go through
00194  * tsearch_readline() to provide better error reporting.
00195  */
00196 char *
00197 t_readline(FILE *fp)
00198 {
00199     int         len;
00200     char       *recoded;
00201     char        buf[4096];      /* lines must not be longer than this */
00202 
00203     if (fgets(buf, sizeof(buf), fp) == NULL)
00204         return NULL;
00205 
00206     len = strlen(buf);
00207 
00208     /* Make sure the input is valid UTF-8 */
00209     (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
00210 
00211     /* And convert */
00212     recoded = (char *) pg_do_encoding_conversion((unsigned char *) buf,
00213                                                  len,
00214                                                  PG_UTF8,
00215                                                  GetDatabaseEncoding());
00216     if (recoded == buf)
00217     {
00218         /*
00219          * conversion didn't pstrdup, so we must. We can use the length of the
00220          * original string, because no conversion was done.
00221          */
00222         recoded = pnstrdup(recoded, len);
00223     }
00224 
00225     return recoded;
00226 }
00227 
/*
 * lowerstr --- return a lower-cased copy of a null-terminated string
 *
 * Convenience wrapper around lowerstr_with_len(); the result is whatever
 * that function returns for the string's full length.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
00238 
/*
 * lowerstr_with_len --- return a lower-cased copy of a counted string
 *
 * str need not be null-terminated; at most len bytes of it are examined.
 *
 * The result is a palloc'd, null-terminated string.
 */
char *
lowerstr_with_len(const char *str, int len)
{
	char	   *out;
	int			i;

#ifdef USE_WIDE_UPPER_LOWER
	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */
#endif

	/* Empty input: hand back an empty string */
	if (len == 0)
		return pstrdup("");

#ifdef USE_WIDE_UPPER_LOWER

	/*
	 * The wide-character path is taken only for a multibyte database
	 * encoding with a non-C ctype.  Some operating systems fail with
	 * multi-byte encodings and a C locale, and a C locale needs no
	 * multibyte processing anyway.  (Approach taken from
	 * backend/utils/adt/oracle_compat.c.)
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
	{
		wchar_t    *wstr;
		wchar_t    *wptr;
		int			wlen;
		int			outsize;
		int			nbytes;

		/*
		 * Worst case is one wchar_t per input byte, plus one for the
		 * terminating zero that wchar2char requires.
		 */
		wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));

		wlen = char2wchar(wstr, len + 1, str, len, mylocale);
		Assert(wlen <= len);

		/* Fold in place, stopping at the wide terminator */
		for (wptr = wstr; *wptr; wptr++)
			*wptr = towlower((wint_t) *wptr);

		/* Worst-case output size: max bytes per character, plus '\0' */
		outsize = pg_database_encoding_max_length() * wlen + 1;
		out = (char *) palloc(outsize);

		nbytes = wchar2char(out, wstr, outsize, mylocale);

		pfree(wstr);

		if (nbytes < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
			errmsg("conversion from wchar_t to server encoding failed: %m")));
		Assert(nbytes < outsize);

		return out;
	}
#endif   /* USE_WIDE_UPPER_LOWER */

	/* Byte-at-a-time folding; stops early at an embedded null */
	out = (char *) palloc(sizeof(char) * (len + 1));
	for (i = 0; i < len && str[i]; i++)
		out[i] = tolower(TOUCHAR(str + i));
	out[i] = '\0';

	return out;
}