00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "postgres.h"
00016
00017 #include "tsearch/ts_locale.h"
00018 #include "tsearch/ts_utils.h"
00019
00020
00021
00022
00023
00024
00025
00026
/*
 * Working state of the tsvector/tsquery lexeme parser.
 *
 * Allocated by init_tsvector_parser() and released by
 * close_tsvector_parser().
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* current read position in the input */
	char	   *bufstart;		/* start of the input; quoted in error messages */
	char	   *word;			/* palloc'd buffer that receives the current word */
	int			len;			/* allocated size of 'word', in bytes */
	int			eml;			/* value of pg_database_encoding_max_length() */
	bool		oprisdelim;		/* treat ! & | ( ) as word delimiters? */
	bool		is_tsquery;		/* report errors as "tsquery" rather than "tsvector"? */
};
00037
00038
00039
00040
00041
00042
00043
00044 TSVectorParseState
00045 init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
00046 {
00047 TSVectorParseState state;
00048
00049 state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
00050 state->prsbuf = input;
00051 state->bufstart = input;
00052 state->len = 32;
00053 state->word = (char *) palloc(state->len);
00054 state->eml = pg_database_encoding_max_length();
00055 state->oprisdelim = oprisdelim;
00056 state->is_tsquery = is_tsquery;
00057
00058 return state;
00059 }
00060
00061
00062
00063
00064 void
00065 reset_tsvector_parser(TSVectorParseState state, char *input)
00066 {
00067 state->prsbuf = input;
00068 }
00069
00070
00071
00072
00073 void
00074 close_tsvector_parser(TSVectorParseState state)
00075 {
00076 pfree(state->word);
00077 pfree(state);
00078 }
00079
00080
/*
 * Make sure the word buffer can take another character.
 *
 * Relies on the locals 'state' and 'curpos' of the expanding function.
 * When no more than state->eml bytes remain free past curpos, the buffer
 * size is doubled and curpos is recomputed against the (possibly relocated)
 * buffer.
 */
#define RESIZEPRSBUF \
do { \
	int			used = curpos - state->word; \
	if (used + state->eml >= state->len) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + used; \
	} \
} while (0)
00091
/*
 * True if x points at a single-byte character that is one of ! & | ( ).
 * Note that x is evaluated several times.
 */
#define ISOPERATOR(x) \
	( pg_mblen(x) == 1 && \
	  ( *(x) == '!' || \
		*(x) == '&' || \
		*(x) == '|' || \
		*(x) == '(' || \
		*(x) == ')' ) )
00093
00094
/*
 * Hand the current token back to the caller of gettoken_tsvector() and
 * return true from it.
 *
 * Relies on the parameters (state, strval, lenval, pos_ptr, poslen, endptr)
 * and locals (curpos, pos, npos) of the expanding function.  The position
 * array is either passed out through pos_ptr/poslen or, when the caller
 * supplied no pos_ptr, freed here.  Each remaining output is filled in only
 * if its pointer is non-NULL.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr == NULL) \
	{ \
		if (pos != NULL) \
			pfree(pos); \
	} \
	else \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while (0)
00113
00114
00115
/* State codes for the state machine in gettoken_tsvector() */
#define WAITWORD		1		/* skipping input until a word starts */
#define WAITENDWORD		2		/* inside an unquoted word */
#define WAITNEXTCHAR	3		/* after a backslash: copy next char as-is */
#define WAITENDCMPLX	4		/* inside a quoted word */
#define WAITPOSINFO		5		/* quoted word ended; is there a ':' ? */
#define INPOSINFO		6		/* expecting the first digit of a position */
#define WAITPOSDELIM	7		/* after a position: ',', a weight, or the end */
#define WAITCHARCMPLX	8		/* saw a quote in a quoted word: doubled or closing? */
00124
00125 #define PRSSYNTAXERROR prssyntaxerror(state)
00126
00127 static void
00128 prssyntaxerror(TSVectorParseState state)
00129 {
00130 ereport(ERROR,
00131 (errcode(ERRCODE_SYNTAX_ERROR),
00132 state->is_tsquery ?
00133 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
00134 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
00135 }
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154 bool
00155 gettoken_tsvector(TSVectorParseState state,
00156 char **strval, int *lenval,
00157 WordEntryPos **pos_ptr, int *poslen,
00158 char **endptr)
00159 {
00160 int oldstate = 0;
00161 char *curpos = state->word;
00162 int statecode = WAITWORD;
00163
00164
00165
00166
00167
00168 WordEntryPos *pos = NULL;
00169 int npos = 0;
00170 int posalen = 0;
00171
00172 while (1)
00173 {
00174 if (statecode == WAITWORD)
00175 {
00176 if (*(state->prsbuf) == '\0')
00177 return false;
00178 else if (t_iseq(state->prsbuf, '\''))
00179 statecode = WAITENDCMPLX;
00180 else if (t_iseq(state->prsbuf, '\\'))
00181 {
00182 statecode = WAITNEXTCHAR;
00183 oldstate = WAITENDWORD;
00184 }
00185 else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
00186 PRSSYNTAXERROR;
00187 else if (!t_isspace(state->prsbuf))
00188 {
00189 COPYCHAR(curpos, state->prsbuf);
00190 curpos += pg_mblen(state->prsbuf);
00191 statecode = WAITENDWORD;
00192 }
00193 }
00194 else if (statecode == WAITNEXTCHAR)
00195 {
00196 if (*(state->prsbuf) == '\0')
00197 ereport(ERROR,
00198 (errcode(ERRCODE_SYNTAX_ERROR),
00199 errmsg("there is no escaped character: \"%s\"",
00200 state->bufstart)));
00201 else
00202 {
00203 RESIZEPRSBUF;
00204 COPYCHAR(curpos, state->prsbuf);
00205 curpos += pg_mblen(state->prsbuf);
00206 Assert(oldstate != 0);
00207 statecode = oldstate;
00208 }
00209 }
00210 else if (statecode == WAITENDWORD)
00211 {
00212 if (t_iseq(state->prsbuf, '\\'))
00213 {
00214 statecode = WAITNEXTCHAR;
00215 oldstate = WAITENDWORD;
00216 }
00217 else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
00218 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
00219 {
00220 RESIZEPRSBUF;
00221 if (curpos == state->word)
00222 PRSSYNTAXERROR;
00223 *(curpos) = '\0';
00224 RETURN_TOKEN;
00225 }
00226 else if (t_iseq(state->prsbuf, ':'))
00227 {
00228 if (curpos == state->word)
00229 PRSSYNTAXERROR;
00230 *(curpos) = '\0';
00231 if (state->oprisdelim)
00232 RETURN_TOKEN;
00233 else
00234 statecode = INPOSINFO;
00235 }
00236 else
00237 {
00238 RESIZEPRSBUF;
00239 COPYCHAR(curpos, state->prsbuf);
00240 curpos += pg_mblen(state->prsbuf);
00241 }
00242 }
00243 else if (statecode == WAITENDCMPLX)
00244 {
00245 if (t_iseq(state->prsbuf, '\''))
00246 {
00247 statecode = WAITCHARCMPLX;
00248 }
00249 else if (t_iseq(state->prsbuf, '\\'))
00250 {
00251 statecode = WAITNEXTCHAR;
00252 oldstate = WAITENDCMPLX;
00253 }
00254 else if (*(state->prsbuf) == '\0')
00255 PRSSYNTAXERROR;
00256 else
00257 {
00258 RESIZEPRSBUF;
00259 COPYCHAR(curpos, state->prsbuf);
00260 curpos += pg_mblen(state->prsbuf);
00261 }
00262 }
00263 else if (statecode == WAITCHARCMPLX)
00264 {
00265 if (t_iseq(state->prsbuf, '\''))
00266 {
00267 RESIZEPRSBUF;
00268 COPYCHAR(curpos, state->prsbuf);
00269 curpos += pg_mblen(state->prsbuf);
00270 statecode = WAITENDCMPLX;
00271 }
00272 else
00273 {
00274 RESIZEPRSBUF;
00275 *(curpos) = '\0';
00276 if (curpos == state->word)
00277 PRSSYNTAXERROR;
00278 if (state->oprisdelim)
00279 {
00280
00281 RETURN_TOKEN;
00282 }
00283 else
00284 statecode = WAITPOSINFO;
00285 continue;
00286 }
00287 }
00288 else if (statecode == WAITPOSINFO)
00289 {
00290 if (t_iseq(state->prsbuf, ':'))
00291 statecode = INPOSINFO;
00292 else
00293 RETURN_TOKEN;
00294 }
00295 else if (statecode == INPOSINFO)
00296 {
00297 if (t_isdigit(state->prsbuf))
00298 {
00299 if (posalen == 0)
00300 {
00301 posalen = 4;
00302 pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
00303 npos = 0;
00304 }
00305 else if (npos + 1 >= posalen)
00306 {
00307 posalen *= 2;
00308 pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
00309 }
00310 npos++;
00311 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
00312
00313 if (WEP_GETPOS(pos[npos - 1]) == 0)
00314 ereport(ERROR,
00315 (errcode(ERRCODE_SYNTAX_ERROR),
00316 errmsg("wrong position info in tsvector: \"%s\"",
00317 state->bufstart)));
00318 WEP_SETWEIGHT(pos[npos - 1], 0);
00319 statecode = WAITPOSDELIM;
00320 }
00321 else
00322 PRSSYNTAXERROR;
00323 }
00324 else if (statecode == WAITPOSDELIM)
00325 {
00326 if (t_iseq(state->prsbuf, ','))
00327 statecode = INPOSINFO;
00328 else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
00329 {
00330 if (WEP_GETWEIGHT(pos[npos - 1]))
00331 PRSSYNTAXERROR;
00332 WEP_SETWEIGHT(pos[npos - 1], 3);
00333 }
00334 else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
00335 {
00336 if (WEP_GETWEIGHT(pos[npos - 1]))
00337 PRSSYNTAXERROR;
00338 WEP_SETWEIGHT(pos[npos - 1], 2);
00339 }
00340 else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
00341 {
00342 if (WEP_GETWEIGHT(pos[npos - 1]))
00343 PRSSYNTAXERROR;
00344 WEP_SETWEIGHT(pos[npos - 1], 1);
00345 }
00346 else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
00347 {
00348 if (WEP_GETWEIGHT(pos[npos - 1]))
00349 PRSSYNTAXERROR;
00350 WEP_SETWEIGHT(pos[npos - 1], 0);
00351 }
00352 else if (t_isspace(state->prsbuf) ||
00353 *(state->prsbuf) == '\0')
00354 RETURN_TOKEN;
00355 else if (!t_isdigit(state->prsbuf))
00356 PRSSYNTAXERROR;
00357 }
00358 else
00359 elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
00360 statecode);
00361
00362
00363 state->prsbuf += pg_mblen(state->prsbuf);
00364 }
00365 }