Header And Logo

PostgreSQL
| The world's most advanced open source database.

tsvector_parser.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * tsvector_parser.c
00004  *    Parser for tsvector
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/utils/adt/tsvector_parser.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 
00015 #include "postgres.h"
00016 
00017 #include "tsearch/ts_locale.h"
00018 #include "tsearch/ts_utils.h"
00019 
00020 
/*
 * Working state for one run of the tsvector parser.
 *
 * The tsquery input code drives this same parser, which is why two boolean
 * switches exist.  In current usage they are either both true or both false,
 * but they are kept as separate fields because they mean different things:
 * oprisdelim changes how the input is tokenized, whereas is_tsquery changes
 * *only* the wording of error messages.
 */
struct TSVectorParseStateData
{
    /* input being scanned */
    char       *bufstart;       /* whole string (used only for errors) */
    char       *prsbuf;         /* next input character */

    /* output buffer for the word currently being assembled */
    char       *word;           /* buffer to hold the current word */
    int         len;            /* size in bytes allocated for 'word' */
    int         eml;            /* max bytes per character */

    /* behavior switches */
    bool        oprisdelim;     /* treat ! | & ( ) as delimiters? */
    bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
};
00037 
00038 
00039 /*
00040  * Initializes parser for the input string. If oprisdelim is set, the
00041  * following characters are treated as delimiters in addition to whitespace:
00042  * ! | & ( )
00043  */
00044 TSVectorParseState
00045 init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
00046 {
00047     TSVectorParseState state;
00048 
00049     state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
00050     state->prsbuf = input;
00051     state->bufstart = input;
00052     state->len = 32;
00053     state->word = (char *) palloc(state->len);
00054     state->eml = pg_database_encoding_max_length();
00055     state->oprisdelim = oprisdelim;
00056     state->is_tsquery = is_tsquery;
00057 
00058     return state;
00059 }
00060 
00061 /*
00062  * Reinitializes parser to parse 'input', instead of previous input.
00063  */
00064 void
00065 reset_tsvector_parser(TSVectorParseState state, char *input)
00066 {
00067     state->prsbuf = input;
00068 }
00069 
00070 /*
00071  * Shuts down a tsvector parser.
00072  */
00073 void
00074 close_tsvector_parser(TSVectorParseState state)
00075 {
00076     pfree(state->word);
00077     pfree(state);
00078 }
00079 
/*
 * Make sure 'word' can take one more character.
 *
 * Expects 'state' and the write cursor 'curpos' to be in scope.  When the
 * free space after curpos is no larger than one maximum-length character
 * (state->eml bytes), the buffer size is doubled and curpos is re-based onto
 * the reallocated buffer.  As a result a full character can always be copied
 * with at least one byte left over for a terminating NUL.
 */
#define RESIZEPRSBUF \
do { \
    int used = curpos - state->word; \
    if ( state->len <= used + state->eml ) \
    { \
        state->len *= 2; \
        state->word = (char *) repalloc(state->word, state->len); \
        curpos = state->word + used; \
    } \
} while (0)
00091 
/*
 * True if the character starting at 'x' is a single-byte character and is
 * one of the delimiter characters ! & | ( ).  The argument is evaluated more
 * than once, so it must not have side effects.
 */
#define ISOPERATOR(x) \
    ( pg_mblen(x) == 1 && \
      ( *(x) == '!' || \
        *(x) == '&' || \
        *(x) == '|' || \
        *(x) == '(' || \
        *(x) == ')' ) )
00093 
/*
 * Hands the finished token back to the caller of gettoken_tsvector and
 * returns true from it.  Must be expanded inside gettoken_tsvector, where
 * the output parameters and the locals state, curpos, pos and npos exist.
 *
 * If the caller passed no pos_ptr, any position array collected for this
 * token is freed here.  Otherwise the array is handed over and *poslen is
 * written unconditionally, so poslen must be non-NULL whenever pos_ptr is.
 * The remaining outputs are optional and skipped when NULL.
 */
#define RETURN_TOKEN \
do { \
    if (pos_ptr == NULL) \
    { \
        if (pos != NULL) \
            pfree(pos); \
    } \
    else \
    { \
        *pos_ptr = pos; \
        *poslen = npos; \
    } \
    if (strval != NULL) \
        *strval = state->word; \
    if (lenval != NULL) \
        *lenval = curpos - state->word; \
    if (endptr != NULL) \
        *endptr = state->prsbuf; \
    return true; \
} while(0)
00113 
00114 
/*
 * State codes used in gettoken_tsvector.  Zero is deliberately not a valid
 * state: gettoken_tsvector uses it to mean "no saved state".
 */
enum
{
    WAITWORD = 1,               /* skipping whitespace before a word */
    WAITENDWORD = 2,            /* inside an unquoted word */
    WAITNEXTCHAR = 3,           /* just saw a backslash; take next char as-is */
    WAITENDCMPLX = 4,           /* inside a quoted word */
    WAITPOSINFO = 5,            /* after a quoted word; ':' may follow */
    INPOSINFO = 6,              /* expecting the first digit of a position */
    WAITPOSDELIM = 7,           /* after a position: weight, ',' or end */
    WAITCHARCMPLX = 8           /* saw a quote inside a quoted word */
};
00124 
00125 #define PRSSYNTAXERROR prssyntaxerror(state)
00126 
00127 static void
00128 prssyntaxerror(TSVectorParseState state)
00129 {
00130     ereport(ERROR,
00131             (errcode(ERRCODE_SYNTAX_ERROR),
00132              state->is_tsquery ?
00133              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
00134              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
00135 }
00136 
00137 
00138 /*
00139  * Get next token from string being parsed. Returns true if successful,
00140  * false if end of input string is reached.  On success, these output
00141  * parameters are filled in:
00142  *
00143  * *strval      pointer to token
00144  * *lenval      length of *strval
00145  * *pos_ptr     pointer to a palloc'd array of positions and weights
00146  *              associated with the token. If the caller is not interested
00147  *              in the information, NULL can be supplied. Otherwise
00148  *              the caller is responsible for pfreeing the array.
00149  * *poslen      number of elements in *pos_ptr
00150  * *endptr      scan resumption point
00151  *
00152  * Pass NULL for unwanted output parameters.
00153  */
00154 bool
00155 gettoken_tsvector(TSVectorParseState state,
00156                   char **strval, int *lenval,
00157                   WordEntryPos **pos_ptr, int *poslen,
00158                   char **endptr)
00159 {
00160     int         oldstate = 0;
00161     char       *curpos = state->word;
00162     int         statecode = WAITWORD;
00163 
00164     /*
00165      * pos is for collecting the comma delimited list of positions followed by
00166      * the actual token.
00167      */
00168     WordEntryPos *pos = NULL;
00169     int         npos = 0;       /* elements of pos used */
00170     int         posalen = 0;    /* allocated size of pos */
00171 
00172     while (1)
00173     {
00174         if (statecode == WAITWORD)
00175         {
00176             if (*(state->prsbuf) == '\0')
00177                 return false;
00178             else if (t_iseq(state->prsbuf, '\''))
00179                 statecode = WAITENDCMPLX;
00180             else if (t_iseq(state->prsbuf, '\\'))
00181             {
00182                 statecode = WAITNEXTCHAR;
00183                 oldstate = WAITENDWORD;
00184             }
00185             else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
00186                 PRSSYNTAXERROR;
00187             else if (!t_isspace(state->prsbuf))
00188             {
00189                 COPYCHAR(curpos, state->prsbuf);
00190                 curpos += pg_mblen(state->prsbuf);
00191                 statecode = WAITENDWORD;
00192             }
00193         }
00194         else if (statecode == WAITNEXTCHAR)
00195         {
00196             if (*(state->prsbuf) == '\0')
00197                 ereport(ERROR,
00198                         (errcode(ERRCODE_SYNTAX_ERROR),
00199                          errmsg("there is no escaped character: \"%s\"",
00200                                 state->bufstart)));
00201             else
00202             {
00203                 RESIZEPRSBUF;
00204                 COPYCHAR(curpos, state->prsbuf);
00205                 curpos += pg_mblen(state->prsbuf);
00206                 Assert(oldstate != 0);
00207                 statecode = oldstate;
00208             }
00209         }
00210         else if (statecode == WAITENDWORD)
00211         {
00212             if (t_iseq(state->prsbuf, '\\'))
00213             {
00214                 statecode = WAITNEXTCHAR;
00215                 oldstate = WAITENDWORD;
00216             }
00217             else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
00218                      (state->oprisdelim && ISOPERATOR(state->prsbuf)))
00219             {
00220                 RESIZEPRSBUF;
00221                 if (curpos == state->word)
00222                     PRSSYNTAXERROR;
00223                 *(curpos) = '\0';
00224                 RETURN_TOKEN;
00225             }
00226             else if (t_iseq(state->prsbuf, ':'))
00227             {
00228                 if (curpos == state->word)
00229                     PRSSYNTAXERROR;
00230                 *(curpos) = '\0';
00231                 if (state->oprisdelim)
00232                     RETURN_TOKEN;
00233                 else
00234                     statecode = INPOSINFO;
00235             }
00236             else
00237             {
00238                 RESIZEPRSBUF;
00239                 COPYCHAR(curpos, state->prsbuf);
00240                 curpos += pg_mblen(state->prsbuf);
00241             }
00242         }
00243         else if (statecode == WAITENDCMPLX)
00244         {
00245             if (t_iseq(state->prsbuf, '\''))
00246             {
00247                 statecode = WAITCHARCMPLX;
00248             }
00249             else if (t_iseq(state->prsbuf, '\\'))
00250             {
00251                 statecode = WAITNEXTCHAR;
00252                 oldstate = WAITENDCMPLX;
00253             }
00254             else if (*(state->prsbuf) == '\0')
00255                 PRSSYNTAXERROR;
00256             else
00257             {
00258                 RESIZEPRSBUF;
00259                 COPYCHAR(curpos, state->prsbuf);
00260                 curpos += pg_mblen(state->prsbuf);
00261             }
00262         }
00263         else if (statecode == WAITCHARCMPLX)
00264         {
00265             if (t_iseq(state->prsbuf, '\''))
00266             {
00267                 RESIZEPRSBUF;
00268                 COPYCHAR(curpos, state->prsbuf);
00269                 curpos += pg_mblen(state->prsbuf);
00270                 statecode = WAITENDCMPLX;
00271             }
00272             else
00273             {
00274                 RESIZEPRSBUF;
00275                 *(curpos) = '\0';
00276                 if (curpos == state->word)
00277                     PRSSYNTAXERROR;
00278                 if (state->oprisdelim)
00279                 {
00280                     /* state->prsbuf+=pg_mblen(state->prsbuf); */
00281                     RETURN_TOKEN;
00282                 }
00283                 else
00284                     statecode = WAITPOSINFO;
00285                 continue;       /* recheck current character */
00286             }
00287         }
00288         else if (statecode == WAITPOSINFO)
00289         {
00290             if (t_iseq(state->prsbuf, ':'))
00291                 statecode = INPOSINFO;
00292             else
00293                 RETURN_TOKEN;
00294         }
00295         else if (statecode == INPOSINFO)
00296         {
00297             if (t_isdigit(state->prsbuf))
00298             {
00299                 if (posalen == 0)
00300                 {
00301                     posalen = 4;
00302                     pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
00303                     npos = 0;
00304                 }
00305                 else if (npos + 1 >= posalen)
00306                 {
00307                     posalen *= 2;
00308                     pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
00309                 }
00310                 npos++;
00311                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
00312                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
00313                 if (WEP_GETPOS(pos[npos - 1]) == 0)
00314                     ereport(ERROR,
00315                             (errcode(ERRCODE_SYNTAX_ERROR),
00316                              errmsg("wrong position info in tsvector: \"%s\"",
00317                                     state->bufstart)));
00318                 WEP_SETWEIGHT(pos[npos - 1], 0);
00319                 statecode = WAITPOSDELIM;
00320             }
00321             else
00322                 PRSSYNTAXERROR;
00323         }
00324         else if (statecode == WAITPOSDELIM)
00325         {
00326             if (t_iseq(state->prsbuf, ','))
00327                 statecode = INPOSINFO;
00328             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
00329             {
00330                 if (WEP_GETWEIGHT(pos[npos - 1]))
00331                     PRSSYNTAXERROR;
00332                 WEP_SETWEIGHT(pos[npos - 1], 3);
00333             }
00334             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
00335             {
00336                 if (WEP_GETWEIGHT(pos[npos - 1]))
00337                     PRSSYNTAXERROR;
00338                 WEP_SETWEIGHT(pos[npos - 1], 2);
00339             }
00340             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
00341             {
00342                 if (WEP_GETWEIGHT(pos[npos - 1]))
00343                     PRSSYNTAXERROR;
00344                 WEP_SETWEIGHT(pos[npos - 1], 1);
00345             }
00346             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
00347             {
00348                 if (WEP_GETWEIGHT(pos[npos - 1]))
00349                     PRSSYNTAXERROR;
00350                 WEP_SETWEIGHT(pos[npos - 1], 0);
00351             }
00352             else if (t_isspace(state->prsbuf) ||
00353                      *(state->prsbuf) == '\0')
00354                 RETURN_TOKEN;
00355             else if (!t_isdigit(state->prsbuf))
00356                 PRSSYNTAXERROR;
00357         }
00358         else    /* internal error */
00359             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
00360                  statecode);
00361 
00362         /* get next char */
00363         state->prsbuf += pg_mblen(state->prsbuf);
00364     }
00365 }