Header And Logo

PostgreSQL
| The world's most advanced open source database.

stringutils.c

Go to the documentation of this file.
00001 /*
00002  * psql - the PostgreSQL interactive terminal
00003  *
00004  * Copyright (c) 2000-2013, PostgreSQL Global Development Group
00005  *
00006  * src/bin/psql/stringutils.c
00007  */
00008 #include "postgres_fe.h"
00009 
00010 #include <ctype.h>
00011 
00012 #include "common.h"
00013 #include "stringutils.h"
00014 
00015 
/*
 * Replacement for strtok() (a.k.a. poor man's flex)
 *
 * Splits a string into tokens, returning one token per call, then NULL
 * when no more tokens exist in the given string.
 *
 * The calling convention is similar to that of strtok, with some extras:
 *
 * s -          string to parse; if NULL, continue parsing the last string
 * whitespace - set of whitespace characters that separate tokens
 * delim -      set of non-whitespace separator characters (or NULL)
 * quote -      set of characters that can quote a token (NULL if none)
 * escape -     character that can quote quotes (0 if none)
 * e_strings -  if TRUE, treat E'...' syntax as a valid token
 * del_quotes - if TRUE, strip quotes from the returned token, else return
 *              it exactly as found in the string
 * encoding -   the active character-set encoding
 *
 * Characters in 'delim', if any, are returned as single-character tokens
 * unless part of a quoted token.
 *
 * Double occurrences of the quoting character are always taken to represent
 * a single quote character in the data.  If escape isn't 0, then escape
 * followed by anything (except \0) is a data character too.
 *
 * The combination of e_strings and del_quotes both TRUE is not currently
 * handled.  This could be fixed but it's not needed anywhere at the moment.
 *
 * The string s itself is never modified: tokenizing is done on a private
 * copy.  That copy and the scan position live in static variables, so only
 * one string can be tokenized at a time, and a returned token stays valid
 * only until the next call (a later call may release the copy).
 *
 * NB: it's okay to vary delim, quote, and escape from one call to the
 * next on a single source string, but changing whitespace is a bad idea
 * since you might lose data.
 */

/*
 * Null-terminate the token that ends just before 'p' and return the position
 * at which the next strtokx() call should resume scanning.
 */
static char *
strtokx_end_token(char *p, const char *whitespace)
{
    /* at end of string the token is already terminated; no extra work */
    if (*p == '\0')
        return p;

    /*
     * A whitespace character can simply be overwritten.  Anything else must
     * survive as (part of) the next token, so shift the remainder of the
     * string, terminator included, one byte to the right to open a slot
     * for the null.  The working copy is allocated with room for this.
     */
    if (!strchr(whitespace, *p))
        memmove(p + 1, p, strlen(p) + 1);
    *p = '\0';

    return p + 1;
}

char *
strtokx(const char *s,
        const char *whitespace,
        const char *delim,
        const char *quote,
        char escape,
        bool e_strings,
        bool del_quotes,
        int encoding)
{
    static char *storage = NULL;    /* private copy of the user's string */
    static char *string = NULL;     /* where in storage to continue on the
                                     * next call */

    char       *start;              /* first byte of the token to return */
    char       *p;                  /* scan pointer */
    size_t      offset;

    if (s)
    {
        size_t      len = strlen(s);
        char       *fresh;

        /*
         * We may need extra space to insert delimiter nulls for adjacent
         * tokens.  2X the space is a gross overestimate, but it's unlikely
         * that this code will be used on huge strings anyway.
         *
         * Copy before releasing the previous buffer, so that a caller
         * handing us a token we returned earlier is not reading freed
         * memory.
         */
        fresh = pg_malloc(2 * len + 1);
        memcpy(fresh, s, len + 1);
        free(storage);
        storage = fresh;
        string = storage;
    }

    /* nothing to continue from */
    if (!storage)
        return NULL;

    /* skip leading whitespace */
    start = string + strspn(string, whitespace);

    /* end of string reached? */
    if (*start == '\0')
    {
        /* technically we don't need to free here, but we're nice */
        free(storage);
        storage = NULL;
        string = NULL;
        return NULL;
    }

    /* a delimiter character is returned as a one-character token */
    if (delim && strchr(delim, *start))
    {
        string = strtokx_end_token(start + 1, whitespace);
        return start;
    }

    /* check for E string */
    p = start;
    if (e_strings &&
        (*p == 'E' || *p == 'e') &&
        p[1] == '\'')
    {
        quote = "'";
        escape = '\\';          /* if std strings before, not any more */
        p++;
    }

    /* test if quoting character */
    if (quote && strchr(quote, *p))
    {
        /* okay, we have a quoted token, now scan for the closer */
        char        thisquote = *p++;

        while (*p != '\0')
        {
            int         charlen;

            if (*p == escape && p[1] != '\0')
                p++;            /* process escaped anything */
            else if (*p == thisquote && p[1] == thisquote)
                p++;            /* process doubled quote */
            else if (*p == thisquote)
            {
                p++;            /* skip trailing quote */
                break;
            }

            /*
             * Step over one (possibly multibyte) character, but never past
             * the terminator, in case PQmblen() reports more bytes than
             * actually remain (e.g. a truncated multibyte sequence).
             */
            charlen = PQmblen(p, encoding);
            while (charlen-- > 0 && *p != '\0')
                p++;
        }

        string = strtokx_end_token(p, whitespace);

        /* Clean up the token if caller wants that */
        if (del_quotes)
            strip_quotes(start, thisquote, escape, encoding);

        return start;
    }

    /*
     * Otherwise no quoting character.  Scan till next whitespace, delimiter
     * or quote.  NB: at this point, *start is known not to be '\0',
     * whitespace, delim, or quote, so we will consume at least one character.
     */
    offset = strcspn(start, whitespace);

    if (delim)
    {
        size_t      upto_delim = strcspn(start, delim);

        if (upto_delim < offset)
            offset = upto_delim;
    }

    if (quote)
    {
        size_t      upto_quote = strcspn(start, quote);

        if (upto_quote < offset)
            offset = upto_quote;
    }

    string = strtokx_end_token(start + offset, whitespace);

    return start;
}
00227 
00228 
/*
 * strip_quotes
 *
 * Remove quotes from the string at *source.  Leading and trailing occurrences
 * of 'quote' are removed; embedded double occurrences of 'quote' are reduced
 * to single occurrences; if 'escape' is not 0 then 'escape' removes special
 * significance of next character.
 *
 * source -   string to clean up; must not be NULL
 * quote -    the quoting character; must not be \0
 * escape -   the escape character, or 0 if none
 * encoding - the active character-set encoding
 *
 * Note that the source string is overwritten in-place.  The result is never
 * longer than the input.
 */
void
strip_quotes(char *source, char quote, char escape, int encoding)
{
    char       *src;
    char       *dst;

    Assert(source != NULL);
    Assert(quote != '\0');

    src = dst = source;

    if (*src && *src == quote)
        src++;                  /* skip leading quote */

    while (*src)
    {
        char        c = *src;
        int         charlen;

        if (c == quote && src[1] == '\0')
            break;              /* skip trailing quote */
        else if (c == quote && src[1] == quote)
            src++;              /* process doubled quote */
        else if (c == escape && src[1] != '\0')
            src++;              /* process escaped character */

        /*
         * Copy one (possibly multibyte) character.  Stop at the terminator
         * even if PQmblen() reports more bytes than actually remain (e.g. a
         * truncated multibyte sequence), so we never read or write beyond
         * the end of the string.
         */
        charlen = PQmblen(src, encoding);
        while (charlen-- > 0 && *src != '\0')
            *dst++ = *src++;
    }

    *dst = '\0';
}
00272 
00273 
/*
 * quote_if_needed
 *
 * Opposite of strip_quotes().  If "source" denotes itself literally without
 * quoting or escaping, returns NULL.  Otherwise, returns a malloc'd copy with
 * quoting and escaping applied:
 *
 * source -         string to parse; must not be NULL
 * entails_quote -  any of these present?  need outer quotes
 * quote -          doubled within string, affixed to both ends; must not
 *                  be \0
 * escape -         doubled within string
 * encoding -       the active character-set encoding
 *
 * The caller owns a non-NULL result and must free() it.
 *
 * Do not use this as a substitute for PQescapeStringConn().  Use it for
 * strings to be parsed by strtokx() or psql_scan_slash_option().
 */
char *
quote_if_needed(const char *source, const char *entails_quote,
                char quote, char escape, int encoding)
{
    const char *src;
    char       *ret;
    char       *dst;
    bool        need_quotes = false;

    Assert(source != NULL);
    Assert(quote != '\0');

    src = source;

    /*
     * Worst case: every input byte is preceded by one added quote/escape
     * byte, plus the two enclosing quotes and the terminator.
     */
    dst = ret = pg_malloc(2 * strlen(src) + 3);

    *dst++ = quote;

    while (*src)
    {
        char        c = *src;
        int         charlen;

        if (c == quote)
        {
            need_quotes = true;
            *dst++ = quote;     /* double the quote */
        }
        else if (c == escape)
        {
            need_quotes = true;
            *dst++ = escape;    /* double the escape */
        }
        else if (strchr(entails_quote, c))
            need_quotes = true;

        /*
         * Copy one (possibly multibyte) character.  Stop at the terminator
         * even if PQmblen() reports more bytes than actually remain (e.g. a
         * truncated multibyte sequence); otherwise we could read past the
         * end of the input and write past the space allocated above.
         */
        charlen = PQmblen(src, encoding);
        while (charlen-- > 0 && *src != '\0')
            *dst++ = *src++;
    }

    *dst++ = quote;
    *dst = '\0';

    /* the quoted copy was built speculatively; discard it if not needed */
    if (!need_quotes)
    {
        free(ret);
        ret = NULL;
    }

    return ret;
}