Header And Logo

PostgreSQL
| The world's most advanced open source database.

wparser_def.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * wparser_def.c
00004  *      Default text search parser
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/tsearch/wparser_def.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 
00015 #include "postgres.h"
00016 
00017 #include "catalog/pg_collation.h"
00018 #include "commands/defrem.h"
00019 #include "tsearch/ts_locale.h"
00020 #include "tsearch/ts_public.h"
00021 #include "tsearch/ts_type.h"
00022 #include "tsearch/ts_utils.h"
00023 #include "utils/builtins.h"
00024 
00025 
00026 /* Define me to enable tracing of parser behavior */
00027 /* #define WPARSER_TRACE */
00028 
00029 
00030 /* Output token categories: ids stored in TParser.type (p_ishost() compares it with HOST, p_isURLPath() with URLPATH); tok_alias[] and lex_descr[] below list their entries in this same order, with slot 0 left empty */
00031 
00032 #define ASCIIWORD       1
00033 #define WORD_T          2
00034 #define NUMWORD         3
00035 #define EMAIL           4
00036 #define URL_T           5
00037 #define HOST            6
00038 #define SCIENTIFIC      7
00039 #define VERSIONNUMBER   8
00040 #define NUMPARTHWORD    9
00041 #define PARTHWORD       10
00042 #define ASCIIPARTHWORD  11
00043 #define SPACE           12
00044 #define TAG_T           13
00045 #define PROTOCOL        14
00046 #define NUMHWORD        15
00047 #define ASCIIHWORD      16
00048 #define HWORD           17
00049 #define URLPATH         18
00050 #define FILEPATH        19
00051 #define DECIMAL_T       20
00052 #define SIGNEDINT       21
00053 #define UNSIGNEDINT     22
00054 #define XMLENTITY       23
00055 
00056 #define LASTNUM         23      /* highest category id; same value as XMLENTITY */
00057 
00058 static const char *const tok_alias[] = {    /* short name of each token category; entry N lines up with category id N defined above */
00059     "",                 /* slot 0: no category has id 0 */
00060     "asciiword",
00061     "word",
00062     "numword",
00063     "email",
00064     "url",
00065     "host",
00066     "sfloat",
00067     "version",
00068     "hword_numpart",
00069     "hword_part",
00070     "hword_asciipart",
00071     "blank",
00072     "tag",
00073     "protocol",
00074     "numhword",
00075     "asciihword",
00076     "hword",
00077     "url_path",
00078     "file",
00079     "float",
00080     "int",
00081     "uint",
00082     "entity"            /* slot 23 == LASTNUM */
00083 };
00084 
00085 static const char *const lex_descr[] = {    /* human-readable description of each token category; same ordering as tok_alias[] */
00086     "",                 /* slot 0: no category has id 0 */
00087     "Word, all ASCII",
00088     "Word, all letters",
00089     "Word, letters and digits",
00090     "Email address",
00091     "URL",
00092     "Host",
00093     "Scientific notation",
00094     "Version number",
00095     "Hyphenated word part, letters and digits",
00096     "Hyphenated word part, all letters",
00097     "Hyphenated word part, all ASCII",
00098     "Space symbols",
00099     "XML tag",
00100     "Protocol head",
00101     "Hyphenated word, letters and digits",
00102     "Hyphenated word, all ASCII",
00103     "Hyphenated word, all letters",
00104     "URL path",
00105     "File or path name",
00106     "Decimal notation",
00107     "Signed integer",
00108     "Unsigned integer",
00109     "XML entity"        /* slot 23 == LASTNUM */
00110 };
00111 
00112 
00113 /* Parser states.  TPS_Base (value 0) is the state TParserInit() and TParserCopyInit() put a fresh parser in; TPS_Null is only an end-of-list marker. */
00114 
00115 typedef enum
00116 {
00117     TPS_Base = 0,
00118     TPS_InNumWord,
00119     TPS_InAsciiWord,
00120     TPS_InWord,
00121     TPS_InUnsignedInt,
00122     TPS_InSignedIntFirst,
00123     TPS_InSignedInt,
00124     TPS_InSpace,
00125     TPS_InUDecimalFirst,
00126     TPS_InUDecimal,
00127     TPS_InDecimalFirst,
00128     TPS_InDecimal,
00129     TPS_InVerVersion,
00130     TPS_InSVerVersion,
00131     TPS_InVersionFirst,
00132     TPS_InVersion,
00133     TPS_InMantissaFirst,
00134     TPS_InMantissaSign,
00135     TPS_InMantissa,
00136     TPS_InXMLEntityFirst,
00137     TPS_InXMLEntity,
00138     TPS_InXMLEntityNumFirst,
00139     TPS_InXMLEntityNum,
00140     TPS_InXMLEntityHexNumFirst,
00141     TPS_InXMLEntityHexNum,
00142     TPS_InXMLEntityEnd,
00143     TPS_InTagFirst,
00144     TPS_InXMLBegin,
00145     TPS_InTagCloseFirst,
00146     TPS_InTagName,
00147     TPS_InTagBeginEnd,
00148     TPS_InTag,
00149     TPS_InTagEscapeK,
00150     TPS_InTagEscapeKK,
00151     TPS_InTagBackSleshed,
00152     TPS_InTagEnd,
00153     TPS_InCommentFirst,
00154     TPS_InCommentLast,
00155     TPS_InComment,
00156     TPS_InCloseCommentFirst,
00157     TPS_InCloseCommentLast,
00158     TPS_InCommentEnd,
00159     TPS_InHostFirstDomain,
00160     TPS_InHostDomainSecond,
00161     TPS_InHostDomain,
00162     TPS_InPortFirst,
00163     TPS_InPort,
00164     TPS_InHostFirstAN,
00165     TPS_InHost,
00166     TPS_InEmail,
00167     TPS_InFileFirst,
00168     TPS_InFileTwiddle,
00169     TPS_InPathFirst,
00170     TPS_InPathFirstFirst,
00171     TPS_InPathSecond,
00172     TPS_InFile,
00173     TPS_InFileNext,
00174     TPS_InURLPathFirst,         /* p_isURLPath() starts its sub-parser in this state */
00175     TPS_InURLPathStart,
00176     TPS_InURLPath,
00177     TPS_InFURL,
00178     TPS_InProtocolFirst,
00179     TPS_InProtocolSecond,
00180     TPS_InProtocolEnd,
00181     TPS_InHyphenAsciiWordFirst,
00182     TPS_InHyphenAsciiWord,
00183     TPS_InHyphenWordFirst,
00184     TPS_InHyphenWord,
00185     TPS_InHyphenNumWordFirst,
00186     TPS_InHyphenNumWord,
00187     TPS_InHyphenDigitLookahead,
00188     TPS_InParseHyphen,
00189     TPS_InParseHyphenHyphen,
00190     TPS_InHyphenWordPart,
00191     TPS_InHyphenAsciiWordPart,
00192     TPS_InHyphenNumWordPart,
00193     TPS_InHyphenUnsignedInt,
00194     TPS_Null                    /* last state (fake value) */
00195 } TParserState;
00196 
00197 /* forward declaration, so the callback typedefs below can name the struct before it is defined */
00198 struct TParser;
00199 
00200 typedef int (*TParserCharTest) (struct TParser *);      /* character test: any p_is* function
00201                                                          * except p_iseq (it takes an extra char) */
00202 typedef void (*TParserSpecial) (struct TParser *);      /* special handler for special cases;
00203                                                          * signature of the Special*() functions */
00204 
00205 typedef struct
00206 {
00207     TParserCharTest isclass;    /* character-test callback */
00208     char        c;              /* NOTE(review): presumably the char copied into TParser.c for p_iseqC/p_isneC - confirm in TParserGet */
00209     uint16      flags;          /* OR of the A_* flag bits */
00210     TParserState tostate;       /* a parser state; NOTE(review): presumably the transition target - confirm */
00211     int         type;           /* NOTE(review): presumably an output token category id - confirm */
00212     TParserSpecial special;     /* special-handler callback */
00213 } TParserStateActionItem;
00214 
00215 /* Flag bits in TParserStateActionItem.flags.  A_NEXT is 0, i.e. "no bit set", so it cannot be detected with a bit-mask test. */
00216 #define A_NEXT      0x0000
00217 #define A_BINGO     0x0001
00218 #define A_POP       0x0002
00219 #define A_PUSH      0x0004
00220 #define A_RERUN     0x0008
00221 #define A_CLEAR     0x0010
00222 #define A_MERGE     0x0020
00223 #define A_CLRALL    0x0040
00224 
00225 typedef struct TParserPosition
00226 {
00227     int         posbyte;        /* position of parser in bytes */
00228     int         poschar;        /* position of parser in characters */
00229     int         charlen;        /* length of current char */
00230     int         lenbytetoken;   /* length of token-so-far in bytes */
00231     int         lenchartoken;   /* and in chars */
00232     TParserState state;         /* parser state at this position */
00233     struct TParserPosition *prev;   /* position this one was created on top of, or NULL; TParserClose() frees the chain through this link */
00234     const TParserStateActionItem *pushedAtAction;   /* newTParserPosition() always resets this to NULL */
00235 } TParserPosition;
00236 
00237 typedef struct TParser
00238 {
00239     /* string and position information */
00240     char       *str;            /* multibyte string */
00241     int         lenstr;         /* length of mbstring */
00242 #ifdef USE_WIDE_UPPER_LOWER
00243     wchar_t    *wstr;           /* wide character string */
00244     pg_wchar   *pgwstr;         /* wide character string for C-locale */
00245     bool        usewide;        /* true when the max encoding length is > 1 (see TParserInit) */
00246 #endif
00247 
00248     /* State of parse */
00249     int         charmaxlen;     /* pg_database_encoding_max_length() at init time */
00250     TParserPosition *state;     /* current position; older ones are reachable through ->prev */
00251     bool        ignore;         /* switched on/off by SpecialTags() at <script>/<style> open and close tags; read by p_isignore() */
00252     bool        wanthost;       /* set by SpecialFURL() and p_ishost(); read and cleared by p_isstophost() */
00253 
00254     /* silly char */
00255     char        c;              /* character that p_iseqC()/p_isneC() compare against */
00256 
00257     /* out */
00258     char       *token;
00259     int         lenbytetoken;
00260     int         lenchartoken;
00261     int         type;           /* compared with token category ids such as HOST and URLPATH */
00262 } TParser;
00263 
00264 
00265 /* forward decls here: p_ishost() and p_isURLPath() call TParserGet() on a sub-parser */
00266 static bool TParserGet(TParser *prs);
00267 
00268 
00269 static TParserPosition *
00270 newTParserPosition(TParserPosition *prev)
00271 {
00272     TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
00273 
00274     if (prev)
00275         memcpy(res, prev, sizeof(TParserPosition));
00276     else
00277         memset(res, 0, sizeof(TParserPosition));
00278 
00279     res->prev = prev;
00280 
00281     res->pushedAtAction = NULL;
00282 
00283     return res;
00284 }
00285 
00286 static TParser *
00287 TParserInit(char *str, int len)
00288 {
00289     TParser    *prs = (TParser *) palloc0(sizeof(TParser));
00290 
00291     prs->charmaxlen = pg_database_encoding_max_length();
00292     prs->str = str;
00293     prs->lenstr = len;
00294 
00295 #ifdef USE_WIDE_UPPER_LOWER
00296 
00297     /*
00298      * Use wide char code only when max encoding length > 1.
00299      */
00300     if (prs->charmaxlen > 1)
00301     {
00302         Oid         collation = DEFAULT_COLLATION_OID;  /* TODO */
00303         pg_locale_t mylocale = 0;       /* TODO */
00304 
00305         prs->usewide = true;
00306         if (lc_ctype_is_c(collation))
00307         {
00308             /*
00309              * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
00310              * be different from sizeof(wchar_t)
00311              */
00312             prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
00313             pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
00314         }
00315         else
00316         {
00317             prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
00318             char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
00319                        mylocale);
00320         }
00321     }
00322     else
00323         prs->usewide = false;
00324 #endif
00325 
00326     prs->state = newTParserPosition(NULL);
00327     prs->state->state = TPS_Base;
00328 
00329 #ifdef WPARSER_TRACE
00330 
00331     /*
00332      * Use of %.*s here is a bit risky since it can misbehave if the data is
00333      * not in what libc thinks is the prevailing encoding.  However, since
00334      * this is just a debugging aid, we choose to live with that.
00335      */
00336     fprintf(stderr, "parsing \"%.*s\"\n", len, str);
00337 #endif
00338 
00339     return prs;
00340 }
00341 
00342 /*
00343  * As an alternative to a full TParserInit one can create a
00344  * TParserCopy which basically is a regular TParser without a private
00345  * copy of the string - instead it uses the one from another TParser.
00346  * This is useful because at some places TParsers are created
00347  * recursively and the repeated copying around of the strings can
00348  * cause major inefficiency if the source string is long.
00349  * The new parser starts parsing at the original's current position.
00350  *
00351  * Obviously one must not close the original TParser before the copy.
00352  */
00353 static TParser *
00354 TParserCopyInit(const TParser *orig)
00355 {
00356     TParser    *prs = (TParser *) palloc0(sizeof(TParser));
00357 
00358     prs->charmaxlen = orig->charmaxlen;
00359     prs->str = orig->str + orig->state->posbyte;
00360     prs->lenstr = orig->lenstr - orig->state->posbyte;
00361 
00362 #ifdef USE_WIDE_UPPER_LOWER
00363     prs->usewide = orig->usewide;
00364 
00365     if (orig->pgwstr)
00366         prs->pgwstr = orig->pgwstr + orig->state->poschar;
00367     if (orig->wstr)
00368         prs->wstr = orig->wstr + orig->state->poschar;
00369 #endif
00370 
00371     prs->state = newTParserPosition(NULL);
00372     prs->state->state = TPS_Base;
00373 
00374 #ifdef WPARSER_TRACE
00375     /* See note above about %.*s */
00376     fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
00377 #endif
00378 
00379     return prs;
00380 }
00381 
00382 
00383 static void
00384 TParserClose(TParser *prs)
00385 {
00386     while (prs->state)
00387     {
00388         TParserPosition *ptr = prs->state->prev;
00389 
00390         pfree(prs->state);
00391         prs->state = ptr;
00392     }
00393 
00394 #ifdef USE_WIDE_UPPER_LOWER
00395     if (prs->wstr)
00396         pfree(prs->wstr);
00397     if (prs->pgwstr)
00398         pfree(prs->pgwstr);
00399 #endif
00400 
00401 #ifdef WPARSER_TRACE
00402     fprintf(stderr, "closing parser\n");
00403 #endif
00404     pfree(prs);
00405 }
00406 
00407 /*
00408  * Close a parser created with TParserCopyInit
00409  */
00410 static void
00411 TParserCopyClose(TParser *prs)
00412 {
00413     while (prs->state)
00414     {
00415         TParserPosition *ptr = prs->state->prev;
00416 
00417         pfree(prs->state);
00418         prs->state = ptr;
00419     }
00420 
00421 #ifdef WPARSER_TRACE
00422     fprintf(stderr, "closing parser copy\n");
00423 #endif
00424     pfree(prs);
00425 }
00426 
00427 
00428 /*
00429  * Character-type support functions, equivalent to is* macros, but
00430  * working with any possible encodings and locales. Notes:
00431  *  - with multibyte encoding and C-locale isw* function may fail
00432  *    or give wrong result.
00433  *  - multibyte encoding and C-locale often are used for
00434  *    Asian languages.
00435  *  - if locale is C then we use pgwstr instead of wstr
00436  */
00437 
00438 #ifdef USE_WIDE_UPPER_LOWER
00439 
/*
 * p_iswhat(type) defines two functions:
 *   p_is<type>(prs)    - classify the character at the parser's current
 *                        position with is<type>() / isw<type>()
 *   p_isnot<type>(prs) - logical negation of the above
 *
 * Three input representations are handled:
 *   - usewide with pgwstr (C locale): the <ctype.h> classifier is applied to
 *     ASCII code points only.  Any code point above 0x7f yields 0; masking
 *     it with 0xff would classify it by its low byte (U+0130 would look like
 *     the digit '0', for instance).  In this branch the macro is only
 *     instantiated for digit/lower/print/punct/space/upper/xdigit; alnum
 *     and alpha are hand-written below and treat non-ASCII as a match.
 *   - usewide with wstr: the <wctype.h> classifier is applied to the wide
 *     character, converted by value to wint_t (the same form p_isalnum
 *     uses) rather than read through a wint_t pointer.
 *   - otherwise: the <ctype.h> classifier is applied to the current byte,
 *     taken as unsigned char.
 */
#define p_iswhat(type)                                                      \
static int                                                                  \
p_is##type(TParser *prs) {                                                  \
    Assert( prs->state );                                                   \
    if ( prs->usewide )                                                     \
    {                                                                       \
        if ( prs->pgwstr )                                                  \
        {                                                                   \
            unsigned int c = *( prs->pgwstr + prs->state->poschar );        \
                                                                            \
            if ( c > 0x7f )                                                 \
                return 0;                                                   \
            return is##type( c );                                           \
        }                                                                   \
                                                                            \
        return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) );  \
    }                                                                       \
                                                                            \
    return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
}   \
                                                                            \
static int                                                                  \
p_isnot##type(TParser *prs) {                                               \
    return !p_is##type(prs);                                                \
}
00459 
00460 static int
00461 p_isalnum(TParser *prs)
00462 {
00463     Assert(prs->state);
00464 
00465     if (prs->usewide)
00466     {
00467         if (prs->pgwstr)
00468         {
00469             unsigned int c = *(prs->pgwstr + prs->state->poschar);
00470 
00471             /*
00472              * any non-ascii symbol with multibyte encoding with C-locale is
00473              * an alpha character
00474              */
00475             if (c > 0x7f)
00476                 return 1;
00477 
00478             return isalnum(0xff & c);
00479         }
00480 
00481         return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
00482     }
00483 
00484     return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
00485 }
00486 static int
00487 p_isnotalnum(TParser *prs)
00488 {
00489     return !p_isalnum(prs);
00490 }
00491 
00492 static int
00493 p_isalpha(TParser *prs)
00494 {
00495     Assert(prs->state);
00496 
00497     if (prs->usewide)
00498     {
00499         if (prs->pgwstr)
00500         {
00501             unsigned int c = *(prs->pgwstr + prs->state->poschar);
00502 
00503             /*
00504              * any non-ascii symbol with multibyte encoding with C-locale is
00505              * an alpha character
00506              */
00507             if (c > 0x7f)
00508                 return 1;
00509 
00510             return isalpha(0xff & c);
00511         }
00512 
00513         return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
00514     }
00515 
00516     return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
00517 }
00518 
00519 static int
00520 p_isnotalpha(TParser *prs)
00521 {
00522     return !p_isalpha(prs);
00523 }
00524 
00525 /* p_iseq should be used only for ascii symbols */
00526 
00527 static int
00528 p_iseq(TParser *prs, char c)
00529 {
00530     Assert(prs->state);
00531     return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
00532 }
00533 #else                           /* USE_WIDE_UPPER_LOWER */
00534 
/*
 * Variant used when USE_WIDE_UPPER_LOWER is not defined: p_iswhat(type)
 * emits p_is<type>(), which applies the <ctype.h> classifier is<type>() to
 * the byte at the current position (taken as unsigned char), and
 * p_isnot<type>(), its logical negation.
 */
#define p_iswhat(type)                                                      \
static int                                                                  \
p_is##type(TParser *prs)                                                    \
{                                                                           \
    Assert(prs->state);                                                     \
    return is##type((unsigned char) prs->str[prs->state->posbyte]);         \
}                                                                           \
                                                                            \
static int                                                                  \
p_isnot##type(TParser *prs)                                                 \
{                                                                           \
    return p_is##type(prs) ? 0 : 1;                                         \
}
00546 
00547 
00548 static int
00549 p_iseq(TParser *prs, char c)
00550 {
00551     Assert(prs->state);
00552     return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
00553 }
00554 
00555 p_iswhat(alnum)     /* defines p_isalnum() and p_isnotalnum(); the wide-character branch hand-writes them instead */
00556 p_iswhat(alpha)     /* defines p_isalpha() and p_isnotalpha() */
00557 #endif   /* USE_WIDE_UPPER_LOWER */
00558 
00559 p_iswhat(digit)     /* each line defines p_is<type>() and p_isnot<type>(), using whichever p_iswhat variant was selected above */
00560 p_iswhat(lower)
00561 p_iswhat(print)
00562 p_iswhat(punct)
00563 p_iswhat(space)
00564 p_iswhat(upper)
00565 p_iswhat(xdigit)
00566 
00567 static int
00568 p_isEOF(TParser *prs)
00569 {
00570     Assert(prs->state);
00571     return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
00572 }
00573 
00574 static int
00575 p_iseqC(TParser *prs)
00576 {
00577     return p_iseq(prs, prs->c);
00578 }
00579 
00580 static int
00581 p_isneC(TParser *prs)
00582 {
00583     return !p_iseq(prs, prs->c);
00584 }
00585 
00586 static int
00587 p_isascii(TParser *prs)
00588 {
00589     return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
00590 }
00591 
00592 static int
00593 p_isasclet(TParser *prs)
00594 {
00595     return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
00596 }
00597 
00598 static int
00599 p_isurlchar(TParser *prs)
00600 {
00601     char        ch;
00602 
00603     /* no non-ASCII need apply */
00604     if (prs->state->charlen != 1)
00605         return 0;
00606     ch = *(prs->str + prs->state->posbyte);
00607     /* no spaces or control characters */
00608     if (ch <= 0x20 || ch >= 0x7F)
00609         return 0;
00610     /* reject characters disallowed by RFC 3986 */
00611     switch (ch)
00612     {
00613         case '"':
00614         case '<':
00615         case '>':
00616         case '\\':
00617         case '^':
00618         case '`':
00619         case '{':
00620         case '|':
00621         case '}':
00622             return 0;
00623     }
00624     return 1;
00625 }
00626 
00627 
00628 /* deliberately suppress unused-function complaints for the above: this non-static function references every p_is* helper once.  It only exists to be compiled; the helpers dereference their argument, so actually calling it with these NULLs would not work. */
00629 void        _make_compiler_happy(void);
00630 void
00631 _make_compiler_happy(void)
00632 {
00633     p_isalnum(NULL);
00634     p_isnotalnum(NULL);
00635     p_isalpha(NULL);
00636     p_isnotalpha(NULL);
00637     p_isdigit(NULL);
00638     p_isnotdigit(NULL);
00639     p_islower(NULL);
00640     p_isnotlower(NULL);
00641     p_isprint(NULL);
00642     p_isnotprint(NULL);
00643     p_ispunct(NULL);
00644     p_isnotpunct(NULL);
00645     p_isspace(NULL);
00646     p_isnotspace(NULL);
00647     p_isupper(NULL);
00648     p_isnotupper(NULL);
00649     p_isxdigit(NULL);
00650     p_isnotxdigit(NULL);
00651     p_isEOF(NULL);
00652     p_iseqC(NULL);
00653     p_isneC(NULL);
00654 }
00655 
00656 
00657 static void
00658 SpecialTags(TParser *prs)
00659 {
00660     switch (prs->state->lenchartoken)
00661     {
00662         case 8:         /* </script */
00663             if (pg_strncasecmp(prs->token, "</script", 8) == 0)
00664                 prs->ignore = false;
00665             break;
00666         case 7:         /* <script || </style */
00667             if (pg_strncasecmp(prs->token, "</style", 7) == 0)
00668                 prs->ignore = false;
00669             else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
00670                 prs->ignore = true;
00671             break;
00672         case 6:         /* <style */
00673             if (pg_strncasecmp(prs->token, "<style", 6) == 0)
00674                 prs->ignore = true;
00675             break;
00676         default:
00677             break;
00678     }
00679 }
00680 
00681 static void
00682 SpecialFURL(TParser *prs)
00683 {
00684     prs->wanthost = true;
00685     prs->state->posbyte -= prs->state->lenbytetoken;
00686     prs->state->poschar -= prs->state->lenchartoken;
00687 }
00688 
00689 static void
00690 SpecialHyphen(TParser *prs)
00691 {
00692     prs->state->posbyte -= prs->state->lenbytetoken;
00693     prs->state->poschar -= prs->state->lenchartoken;
00694 }
00695 
00696 static void
00697 SpecialVerVersion(TParser *prs)
00698 {
00699     prs->state->posbyte -= prs->state->lenbytetoken;
00700     prs->state->poschar -= prs->state->lenchartoken;
00701     prs->state->lenbytetoken = 0;
00702     prs->state->lenchartoken = 0;
00703 }
00704 
00705 static int
00706 p_isstophost(TParser *prs)
00707 {
00708     if (prs->wanthost)
00709     {
00710         prs->wanthost = false;
00711         return 1;
00712     }
00713     return 0;
00714 }
00715 
00716 static int
00717 p_isignore(TParser *prs)
00718 {
00719     return (prs->ignore) ? 1 : 0;
00720 }
00721 
00722 static int
00723 p_ishost(TParser *prs)
00724 {
00725     TParser    *tmpprs = TParserCopyInit(prs);
00726     int         res = 0;
00727 
00728     tmpprs->wanthost = true;
00729 
00730     if (TParserGet(tmpprs) && tmpprs->type == HOST)
00731     {
00732         prs->state->posbyte += tmpprs->lenbytetoken;
00733         prs->state->poschar += tmpprs->lenchartoken;
00734         prs->state->lenbytetoken += tmpprs->lenbytetoken;
00735         prs->state->lenchartoken += tmpprs->lenchartoken;
00736         prs->state->charlen = tmpprs->state->charlen;
00737         res = 1;
00738     }
00739     TParserCopyClose(tmpprs);
00740 
00741     return res;
00742 }
00743 
00744 static int
00745 p_isURLPath(TParser *prs)
00746 {
00747     TParser    *tmpprs = TParserCopyInit(prs);
00748     int         res = 0;
00749 
00750     tmpprs->state = newTParserPosition(tmpprs->state);
00751     tmpprs->state->state = TPS_InURLPathFirst;
00752 
00753     if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
00754     {
00755         prs->state->posbyte += tmpprs->lenbytetoken;
00756         prs->state->poschar += tmpprs->lenchartoken;
00757         prs->state->lenbytetoken += tmpprs->lenbytetoken;
00758         prs->state->lenchartoken += tmpprs->lenchartoken;
00759         prs->state->charlen = tmpprs->state->charlen;
00760         res = 1;
00761     }
00762     TParserCopyClose(tmpprs);
00763 
00764     return res;
00765 }
00766 
00767 /*
00768  * Returns true if the current character has zero display length or
00769  * is a special sign used in several languages. Such characters do not
00770  * break a word, even though isalpha() is false for them.
00771  * At the beginning of a word they are not treated as part of it.
00772  */
00773 static int
00774 p_isspecial(TParser *prs)
00775 {
00776     /*
00777      * pg_dsplen could return -1 which means error or control character
00778      */
00779     if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
00780         return 1;
00781 
00782 #ifdef USE_WIDE_UPPER_LOWER
00783 
00784     /*
00785      * Unicode Characters in the 'Mark, Spacing Combining' Category That
00786      * characters are not alpha although they are not breakers of word too.
00787      * Check that only in utf encoding, because other encodings aren't
00788      * supported by postgres or even exists.
00789      */
00790     if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
00791     {
00792         static pg_wchar strange_letter[] = {
00793             /*
00794              * use binary search, so elements should be ordered
00795              */
00796             0x0903,             /* DEVANAGARI SIGN VISARGA */
00797             0x093E,             /* DEVANAGARI VOWEL SIGN AA */
00798             0x093F,             /* DEVANAGARI VOWEL SIGN I */
00799             0x0940,             /* DEVANAGARI VOWEL SIGN II */
00800             0x0949,             /* DEVANAGARI VOWEL SIGN CANDRA O */
00801             0x094A,             /* DEVANAGARI VOWEL SIGN SHORT O */
00802             0x094B,             /* DEVANAGARI VOWEL SIGN O */
00803             0x094C,             /* DEVANAGARI VOWEL SIGN AU */
00804             0x0982,             /* BENGALI SIGN ANUSVARA */
00805             0x0983,             /* BENGALI SIGN VISARGA */
00806             0x09BE,             /* BENGALI VOWEL SIGN AA */
00807             0x09BF,             /* BENGALI VOWEL SIGN I */
00808             0x09C0,             /* BENGALI VOWEL SIGN II */
00809             0x09C7,             /* BENGALI VOWEL SIGN E */
00810             0x09C8,             /* BENGALI VOWEL SIGN AI */
00811             0x09CB,             /* BENGALI VOWEL SIGN O */
00812             0x09CC,             /* BENGALI VOWEL SIGN AU */
00813             0x09D7,             /* BENGALI AU LENGTH MARK */
00814             0x0A03,             /* GURMUKHI SIGN VISARGA */
00815             0x0A3E,             /* GURMUKHI VOWEL SIGN AA */
00816             0x0A3F,             /* GURMUKHI VOWEL SIGN I */
00817             0x0A40,             /* GURMUKHI VOWEL SIGN II */
00818             0x0A83,             /* GUJARATI SIGN VISARGA */
00819             0x0ABE,             /* GUJARATI VOWEL SIGN AA */
00820             0x0ABF,             /* GUJARATI VOWEL SIGN I */
00821             0x0AC0,             /* GUJARATI VOWEL SIGN II */
00822             0x0AC9,             /* GUJARATI VOWEL SIGN CANDRA O */
00823             0x0ACB,             /* GUJARATI VOWEL SIGN O */
00824             0x0ACC,             /* GUJARATI VOWEL SIGN AU */
00825             0x0B02,             /* ORIYA SIGN ANUSVARA */
00826             0x0B03,             /* ORIYA SIGN VISARGA */
00827             0x0B3E,             /* ORIYA VOWEL SIGN AA */
00828             0x0B40,             /* ORIYA VOWEL SIGN II */
00829             0x0B47,             /* ORIYA VOWEL SIGN E */
00830             0x0B48,             /* ORIYA VOWEL SIGN AI */
00831             0x0B4B,             /* ORIYA VOWEL SIGN O */
00832             0x0B4C,             /* ORIYA VOWEL SIGN AU */
00833             0x0B57,             /* ORIYA AU LENGTH MARK */
00834             0x0BBE,             /* TAMIL VOWEL SIGN AA */
00835             0x0BBF,             /* TAMIL VOWEL SIGN I */
00836             0x0BC1,             /* TAMIL VOWEL SIGN U */
00837             0x0BC2,             /* TAMIL VOWEL SIGN UU */
00838             0x0BC6,             /* TAMIL VOWEL SIGN E */
00839             0x0BC7,             /* TAMIL VOWEL SIGN EE */
00840             0x0BC8,             /* TAMIL VOWEL SIGN AI */
00841             0x0BCA,             /* TAMIL VOWEL SIGN O */
00842             0x0BCB,             /* TAMIL VOWEL SIGN OO */
00843             0x0BCC,             /* TAMIL VOWEL SIGN AU */
00844             0x0BD7,             /* TAMIL AU LENGTH MARK */
00845             0x0C01,             /* TELUGU SIGN CANDRABINDU */
00846             0x0C02,             /* TELUGU SIGN ANUSVARA */
00847             0x0C03,             /* TELUGU SIGN VISARGA */
00848             0x0C41,             /* TELUGU VOWEL SIGN U */
00849             0x0C42,             /* TELUGU VOWEL SIGN UU */
00850             0x0C43,             /* TELUGU VOWEL SIGN VOCALIC R */
00851             0x0C44,             /* TELUGU VOWEL SIGN VOCALIC RR */
00852             0x0C82,             /* KANNADA SIGN ANUSVARA */
00853             0x0C83,             /* KANNADA SIGN VISARGA */
00854             0x0CBE,             /* KANNADA VOWEL SIGN AA */
00855             0x0CC0,             /* KANNADA VOWEL SIGN II */
00856             0x0CC1,             /* KANNADA VOWEL SIGN U */
00857             0x0CC2,             /* KANNADA VOWEL SIGN UU */
00858             0x0CC3,             /* KANNADA VOWEL SIGN VOCALIC R */
00859             0x0CC4,             /* KANNADA VOWEL SIGN VOCALIC RR */
00860             0x0CC7,             /* KANNADA VOWEL SIGN EE */
00861             0x0CC8,             /* KANNADA VOWEL SIGN AI */
00862             0x0CCA,             /* KANNADA VOWEL SIGN O */
00863             0x0CCB,             /* KANNADA VOWEL SIGN OO */
00864             0x0CD5,             /* KANNADA LENGTH MARK */
00865             0x0CD6,             /* KANNADA AI LENGTH MARK */
00866             0x0D02,             /* MALAYALAM SIGN ANUSVARA */
00867             0x0D03,             /* MALAYALAM SIGN VISARGA */
00868             0x0D3E,             /* MALAYALAM VOWEL SIGN AA */
00869             0x0D3F,             /* MALAYALAM VOWEL SIGN I */
00870             0x0D40,             /* MALAYALAM VOWEL SIGN II */
00871             0x0D46,             /* MALAYALAM VOWEL SIGN E */
00872             0x0D47,             /* MALAYALAM VOWEL SIGN EE */
00873             0x0D48,             /* MALAYALAM VOWEL SIGN AI */
00874             0x0D4A,             /* MALAYALAM VOWEL SIGN O */
00875             0x0D4B,             /* MALAYALAM VOWEL SIGN OO */
00876             0x0D4C,             /* MALAYALAM VOWEL SIGN AU */
00877             0x0D57,             /* MALAYALAM AU LENGTH MARK */
00878             0x0D82,             /* SINHALA SIGN ANUSVARAYA */
00879             0x0D83,             /* SINHALA SIGN VISARGAYA */
00880             0x0DCF,             /* SINHALA VOWEL SIGN AELA-PILLA */
00881             0x0DD0,             /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
00882             0x0DD1,             /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
00883             0x0DD8,             /* SINHALA VOWEL SIGN GAETTA-PILLA */
00884             0x0DD9,             /* SINHALA VOWEL SIGN KOMBUVA */
00885             0x0DDA,             /* SINHALA VOWEL SIGN DIGA KOMBUVA */
00886             0x0DDB,             /* SINHALA VOWEL SIGN KOMBU DEKA */
00887             0x0DDC,             /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
00888             0x0DDD,             /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
00889                                  * AELA-PILLA */
00890             0x0DDE,             /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
00891             0x0DDF,             /* SINHALA VOWEL SIGN GAYANUKITTA */
00892             0x0DF2,             /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
00893             0x0DF3,             /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
00894             0x0F3E,             /* TIBETAN SIGN YAR TSHES */
00895             0x0F3F,             /* TIBETAN SIGN MAR TSHES */
00896             0x0F7F,             /* TIBETAN SIGN RNAM BCAD */
00897             0x102B,             /* MYANMAR VOWEL SIGN TALL AA */
00898             0x102C,             /* MYANMAR VOWEL SIGN AA */
00899             0x1031,             /* MYANMAR VOWEL SIGN E */
00900             0x1038,             /* MYANMAR SIGN VISARGA */
00901             0x103B,             /* MYANMAR CONSONANT SIGN MEDIAL YA */
00902             0x103C,             /* MYANMAR CONSONANT SIGN MEDIAL RA */
00903             0x1056,             /* MYANMAR VOWEL SIGN VOCALIC R */
00904             0x1057,             /* MYANMAR VOWEL SIGN VOCALIC RR */
00905             0x1062,             /* MYANMAR VOWEL SIGN SGAW KAREN EU */
00906             0x1063,             /* MYANMAR TONE MARK SGAW KAREN HATHI */
00907             0x1064,             /* MYANMAR TONE MARK SGAW KAREN KE PHO */
00908             0x1067,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
00909             0x1068,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
00910             0x1069,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
00911             0x106A,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
00912             0x106B,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
00913             0x106C,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
00914             0x106D,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
00915             0x1083,             /* MYANMAR VOWEL SIGN SHAN AA */
00916             0x1084,             /* MYANMAR VOWEL SIGN SHAN E */
00917             0x1087,             /* MYANMAR SIGN SHAN TONE-2 */
00918             0x1088,             /* MYANMAR SIGN SHAN TONE-3 */
00919             0x1089,             /* MYANMAR SIGN SHAN TONE-5 */
00920             0x108A,             /* MYANMAR SIGN SHAN TONE-6 */
00921             0x108B,             /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
00922             0x108C,             /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
00923             0x108F,             /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
00924             0x17B6,             /* KHMER VOWEL SIGN AA */
00925             0x17BE,             /* KHMER VOWEL SIGN OE */
00926             0x17BF,             /* KHMER VOWEL SIGN YA */
00927             0x17C0,             /* KHMER VOWEL SIGN IE */
00928             0x17C1,             /* KHMER VOWEL SIGN E */
00929             0x17C2,             /* KHMER VOWEL SIGN AE */
00930             0x17C3,             /* KHMER VOWEL SIGN AI */
00931             0x17C4,             /* KHMER VOWEL SIGN OO */
00932             0x17C5,             /* KHMER VOWEL SIGN AU */
00933             0x17C7,             /* KHMER SIGN REAHMUK */
00934             0x17C8,             /* KHMER SIGN YUUKALEAPINTU */
00935             0x1923,             /* LIMBU VOWEL SIGN EE */
00936             0x1924,             /* LIMBU VOWEL SIGN AI */
00937             0x1925,             /* LIMBU VOWEL SIGN OO */
00938             0x1926,             /* LIMBU VOWEL SIGN AU */
00939             0x1929,             /* LIMBU SUBJOINED LETTER YA */
00940             0x192A,             /* LIMBU SUBJOINED LETTER RA */
00941             0x192B,             /* LIMBU SUBJOINED LETTER WA */
00942             0x1930,             /* LIMBU SMALL LETTER KA */
00943             0x1931,             /* LIMBU SMALL LETTER NGA */
00944             0x1933,             /* LIMBU SMALL LETTER TA */
00945             0x1934,             /* LIMBU SMALL LETTER NA */
00946             0x1935,             /* LIMBU SMALL LETTER PA */
00947             0x1936,             /* LIMBU SMALL LETTER MA */
00948             0x1937,             /* LIMBU SMALL LETTER RA */
00949             0x1938,             /* LIMBU SMALL LETTER LA */
00950             0x19B0,             /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
00951             0x19B1,             /* NEW TAI LUE VOWEL SIGN AA */
00952             0x19B2,             /* NEW TAI LUE VOWEL SIGN II */
00953             0x19B3,             /* NEW TAI LUE VOWEL SIGN U */
00954             0x19B4,             /* NEW TAI LUE VOWEL SIGN UU */
00955             0x19B5,             /* NEW TAI LUE VOWEL SIGN E */
00956             0x19B6,             /* NEW TAI LUE VOWEL SIGN AE */
00957             0x19B7,             /* NEW TAI LUE VOWEL SIGN O */
00958             0x19B8,             /* NEW TAI LUE VOWEL SIGN OA */
00959             0x19B9,             /* NEW TAI LUE VOWEL SIGN UE */
00960             0x19BA,             /* NEW TAI LUE VOWEL SIGN AY */
00961             0x19BB,             /* NEW TAI LUE VOWEL SIGN AAY */
00962             0x19BC,             /* NEW TAI LUE VOWEL SIGN UY */
00963             0x19BD,             /* NEW TAI LUE VOWEL SIGN OY */
00964             0x19BE,             /* NEW TAI LUE VOWEL SIGN OAY */
00965             0x19BF,             /* NEW TAI LUE VOWEL SIGN UEY */
00966             0x19C0,             /* NEW TAI LUE VOWEL SIGN IY */
00967             0x19C8,             /* NEW TAI LUE TONE MARK-1 */
00968             0x19C9,             /* NEW TAI LUE TONE MARK-2 */
00969             0x1A19,             /* BUGINESE VOWEL SIGN E */
00970             0x1A1A,             /* BUGINESE VOWEL SIGN O */
00971             0x1A1B,             /* BUGINESE VOWEL SIGN AE */
00972             0x1B04,             /* BALINESE SIGN BISAH */
00973             0x1B35,             /* BALINESE VOWEL SIGN TEDUNG */
00974             0x1B3B,             /* BALINESE VOWEL SIGN RA REPA TEDUNG */
00975             0x1B3D,             /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
00976             0x1B3E,             /* BALINESE VOWEL SIGN TALING */
00977             0x1B3F,             /* BALINESE VOWEL SIGN TALING REPA */
00978             0x1B40,             /* BALINESE VOWEL SIGN TALING TEDUNG */
00979             0x1B41,             /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
00980             0x1B43,             /* BALINESE VOWEL SIGN PEPET TEDUNG */
00981             0x1B44,             /* BALINESE ADEG ADEG */
00982             0x1B82,             /* SUNDANESE SIGN PANGWISAD */
00983             0x1BA1,             /* SUNDANESE CONSONANT SIGN PAMINGKAL */
00984             0x1BA6,             /* SUNDANESE VOWEL SIGN PANAELAENG */
00985             0x1BA7,             /* SUNDANESE VOWEL SIGN PANOLONG */
00986             0x1BAA,             /* SUNDANESE SIGN PAMAAEH */
00987             0x1C24,             /* LEPCHA SUBJOINED LETTER YA */
00988             0x1C25,             /* LEPCHA SUBJOINED LETTER RA */
00989             0x1C26,             /* LEPCHA VOWEL SIGN AA */
00990             0x1C27,             /* LEPCHA VOWEL SIGN I */
00991             0x1C28,             /* LEPCHA VOWEL SIGN O */
00992             0x1C29,             /* LEPCHA VOWEL SIGN OO */
00993             0x1C2A,             /* LEPCHA VOWEL SIGN U */
00994             0x1C2B,             /* LEPCHA VOWEL SIGN UU */
00995             0x1C34,             /* LEPCHA CONSONANT SIGN NYIN-DO */
00996             0x1C35,             /* LEPCHA CONSONANT SIGN KANG */
00997             0xA823,             /* SYLOTI NAGRI VOWEL SIGN A */
00998             0xA824,             /* SYLOTI NAGRI VOWEL SIGN I */
00999             0xA827,             /* SYLOTI NAGRI VOWEL SIGN OO */
01000             0xA880,             /* SAURASHTRA SIGN ANUSVARA */
01001             0xA881,             /* SAURASHTRA SIGN VISARGA */
01002             0xA8B4,             /* SAURASHTRA CONSONANT SIGN HAARU */
01003             0xA8B5,             /* SAURASHTRA VOWEL SIGN AA */
01004             0xA8B6,             /* SAURASHTRA VOWEL SIGN I */
01005             0xA8B7,             /* SAURASHTRA VOWEL SIGN II */
01006             0xA8B8,             /* SAURASHTRA VOWEL SIGN U */
01007             0xA8B9,             /* SAURASHTRA VOWEL SIGN UU */
01008             0xA8BA,             /* SAURASHTRA VOWEL SIGN VOCALIC R */
01009             0xA8BB,             /* SAURASHTRA VOWEL SIGN VOCALIC RR */
01010             0xA8BC,             /* SAURASHTRA VOWEL SIGN VOCALIC L */
01011             0xA8BD,             /* SAURASHTRA VOWEL SIGN VOCALIC LL */
01012             0xA8BE,             /* SAURASHTRA VOWEL SIGN E */
01013             0xA8BF,             /* SAURASHTRA VOWEL SIGN EE */
01014             0xA8C0,             /* SAURASHTRA VOWEL SIGN AI */
01015             0xA8C1,             /* SAURASHTRA VOWEL SIGN O */
01016             0xA8C2,             /* SAURASHTRA VOWEL SIGN OO */
01017             0xA8C3,             /* SAURASHTRA VOWEL SIGN AU */
01018             0xA952,             /* REJANG CONSONANT SIGN H */
01019             0xA953,             /* REJANG VIRAMA */
01020             0xAA2F,             /* CHAM VOWEL SIGN O */
01021             0xAA30,             /* CHAM VOWEL SIGN AI */
01022             0xAA33,             /* CHAM CONSONANT SIGN YA */
01023             0xAA34,             /* CHAM CONSONANT SIGN RA */
01024             0xAA4D              /* CHAM CONSONANT SIGN FINAL H */
01025         };
01026         pg_wchar   *StopLow = strange_letter,
01027                    *StopHigh = strange_letter + lengthof(strange_letter),
01028                    *StopMiddle;
01029         pg_wchar    c;
01030 
01031         if (prs->pgwstr)
01032             c = *(prs->pgwstr + prs->state->poschar);
01033         else
01034             c = (pg_wchar) *(prs->wstr + prs->state->poschar);
01035 
01036         while (StopLow < StopHigh)
01037         {
01038             StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
01039             if (*StopMiddle == c)
01040                 return 1;
01041             else if (*StopMiddle < c)
01042                 StopLow = StopMiddle + 1;
01043             else
01044                 StopHigh = StopMiddle;
01045         }
01046     }
01047 #endif
01048 
01049     return 0;
01050 }
01051 
01052 /*
01053  * State/action tables of the parser (one table per parser state)
01054  */
01055 
/*
 * Action rows for TPS_Base.  ASCII letters, other letters and digits are
 * A_NEXT rows into TPS_InAsciiWord, TPS_InWord and TPS_InUnsignedInt.
 * '<', '-', '+', '&', '~', '/' and '.' are A_PUSH rows into the tag,
 * signed-int, XML-entity, file and path states.  p_isignore characters and
 * the final NULL-test row both lead to TPS_InSpace.
 * NOTE(review): the NULL-test row looks like a catch-all, so rows are
 * presumably tried in order -- confirm against the parser loop before
 * reordering anything.
 */
01056 static const TParserStateActionItem actionTPS_Base[] = {
01057     {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
01058     {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
01059     {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
01060     {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
01061     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
01062     {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
01063     {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
01064     {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
01065     {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
01066     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
01067     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01068     {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
01069     {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
01070 };
01071 
01072 
/*
 * Action rows for TPS_InNumWord.  Alphanumeric and p_isspecial characters
 * keep the state.  '@', '/', '.' and '-' are A_PUSH rows into TPS_InEmail,
 * TPS_InFileFirst, TPS_InFileNext and TPS_InHyphenNumWordFirst.  EOF and the
 * NULL-test row are A_BINGO rows with token type NUMWORD.
 */
01073 static const TParserStateActionItem actionTPS_InNumWord[] = {
01074     {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
01075     {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01076     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01077     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01078     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01079     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
01080     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
01081     {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
01082 };
01083 
/*
 * Action rows for TPS_InAsciiWord.  ASCII letters are an A_NEXT row with a
 * TPS_Null target.  '.' and '-' each appear in two consecutive A_PUSH rows
 * (host state first, then file / hyphenated-word state), and a digit has an
 * A_PUSH row into TPS_InHost followed by an A_NEXT row into TPS_InNumWord.
 * Other p_isalpha and p_isspecial characters go to TPS_InWord.  EOF and the
 * NULL-test row are A_BINGO rows with token type ASCIIWORD.
 * NOTE(review): the duplicated tests suggest their relative order is
 * significant -- confirm before reordering.
 */
01084 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
01085     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
01086     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
01087     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01088     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
01089     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01090     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
01091     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01092     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01093     {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
01094     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01095     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
01096     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01097     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
01098     {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
01099     {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
01100 };
01101 
/*
 * Action rows for TPS_InWord.  p_isalpha and p_isspecial characters are
 * A_NEXT rows with a TPS_Null target; a digit goes to TPS_InNumWord; '-' is
 * an A_PUSH row into TPS_InHyphenWordFirst.  EOF and the NULL-test row are
 * A_BINGO rows with token type WORD_T.
 */
01102 static const TParserStateActionItem actionTPS_InWord[] = {
01103     {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
01104     {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
01105     {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
01106     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01107     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
01108     {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
01109 };
01110 
/*
 * Action rows for TPS_InUnsignedInt.  Digits are an A_NEXT row with a
 * TPS_Null target.  '.' appears in two A_PUSH rows (TPS_InHostFirstDomain,
 * then TPS_InUDecimalFirst); 'e'/'E' are A_PUSH rows into
 * TPS_InMantissaFirst and precede the general ASCII-letter A_PUSH row into
 * TPS_InHost.  p_isalpha and p_isspecial go to TPS_InNumWord; '/' is an
 * A_PUSH row into TPS_InFileFirst.  EOF and the NULL-test row are A_BINGO
 * rows with token type UNSIGNEDINT.
 */
01111 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
01112     {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
01113     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
01114     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01115     {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
01116     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01117     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01118     {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
01119     {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01120     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01121     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01122     {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
01123 };
01124 
/*
 * Action rows for TPS_InSignedIntFirst (reached from the '-' and '+' rows of
 * actionTPS_Base).  A digit is an A_NEXT | A_CLEAR row into TPS_InSignedInt;
 * EOF and every other character are A_POP rows.
 */
01125 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
01126     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01127     {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
01128     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01129 };
01130 
/*
 * Action rows for TPS_InSignedInt.  Digits are an A_NEXT row with a TPS_Null
 * target; '.' is an A_PUSH row into TPS_InDecimalFirst; 'e'/'E' are A_PUSH
 * rows into TPS_InMantissaFirst.  EOF and the NULL-test row are A_BINGO rows
 * with token type SIGNEDINT.
 */
01131 static const TParserStateActionItem actionTPS_InSignedInt[] = {
01132     {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
01133     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
01134     {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
01135     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01136     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01137     {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
01138 };
01139 
/*
 * Action rows for TPS_InSpace.  EOF, '<', '-', '+', '&', '/' and the
 * NULL-test row are A_BINGO rows with token type SPACE and target TPS_Base.
 * p_isignore (TPS_Null target) and p_isnotalnum (TPS_InSpace target) are
 * A_NEXT rows.  The '<', '-', '+', '&' and '/' rows are listed before the
 * p_isnotalnum row; these are characters that have A_PUSH rows in
 * actionTPS_Base.
 */
01140 static const TParserStateActionItem actionTPS_InSpace[] = {
01141     {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
01142     {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
01143     {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
01144     {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
01145     {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
01146     {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
01147     {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
01148     {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
01149     {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
01150 };
01151 
/*
 * Action rows for TPS_InUDecimalFirst.  A digit is an A_CLEAR row into
 * TPS_InUDecimal; EOF and every other character are A_POP rows.
 */
01152 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
01153     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01154     {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
01155     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01156 };
01157 
/*
 * Action rows for TPS_InUDecimal.  Digits keep the state; '.' is an A_PUSH
 * row into TPS_InVersionFirst; 'e'/'E' are A_PUSH rows into
 * TPS_InMantissaFirst.  EOF and the NULL-test row are A_BINGO rows with
 * token type DECIMAL_T.
 */
01158 static const TParserStateActionItem actionTPS_InUDecimal[] = {
01159     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
01160     {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
01161     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
01162     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01163     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01164     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
01165 };
01166 
/*
 * Action rows for TPS_InDecimalFirst.  A digit is an A_CLEAR row into
 * TPS_InDecimal; EOF and every other character are A_POP rows.
 */
01167 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
01168     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01169     {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
01170     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01171 };
01172 
/*
 * Action rows for TPS_InDecimal.  Digits keep the state; '.' is an A_PUSH
 * row into TPS_InVerVersion (the unsigned variant, actionTPS_InUDecimal,
 * pushes TPS_InVersionFirst instead); 'e'/'E' are A_PUSH rows into
 * TPS_InMantissaFirst.  EOF and the NULL-test row are A_BINGO rows with
 * token type DECIMAL_T.
 */
01173 static const TParserStateActionItem actionTPS_InDecimal[] = {
01174     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
01175     {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
01176     {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
01177     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01178     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01179     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
01180 };
01181 
/*
 * Action rows for TPS_InVerVersion.  A digit is an A_RERUN row into
 * TPS_InSVerVersion with the SpecialVerVersion handler attached; EOF and
 * every other character are A_POP rows.
 */
01182 static const TParserStateActionItem actionTPS_InVerVersion[] = {
01183     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01184     {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
01185     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01186 };
01187 
/*
 * Action rows for TPS_InSVerVersion.  A digit is an A_BINGO | A_CLRALL row
 * with token type SPACE and target TPS_InUnsignedInt; EOF is an A_POP row;
 * any other character is an A_NEXT row with a TPS_Null target.
 */
01188 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
01189     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01190     {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
01191     {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
01192 };
01193 
01194 
/*
 * Action rows for TPS_InVersionFirst.  A digit is an A_CLEAR row into
 * TPS_InVersion; EOF and every other character are A_POP rows.
 */
01195 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
01196     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01197     {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
01198     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01199 };
01200 
/*
 * Action rows for TPS_InVersion.  Digits keep the state; '.' is an A_PUSH
 * row into TPS_InVersionFirst.  EOF and the NULL-test row are A_BINGO rows
 * with token type VERSIONNUMBER.
 */
01201 static const TParserStateActionItem actionTPS_InVersion[] = {
01202     {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
01203     {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
01204     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
01205     {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
01206 };
01207 
/*
 * Action rows for TPS_InMantissaFirst (pushed by the 'e'/'E' rows of the
 * numeric tables).  A digit is an A_CLEAR row into TPS_InMantissa; '+' and
 * '-' are A_NEXT rows into TPS_InMantissaSign; EOF and every other character
 * are A_POP rows.
 */
01208 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
01209     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01210     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
01211     {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
01212     {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
01213     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01214 };
01215 
/*
 * Action rows for TPS_InMantissaSign.  A digit is an A_CLEAR row into
 * TPS_InMantissa; EOF and every other character are A_POP rows.
 */
01216 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
01217     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01218     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
01219     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01220 };
01221 
/*
 * Action rows for TPS_InMantissa.  Digits keep the state; EOF and the
 * NULL-test row are A_BINGO rows with token type SCIENTIFIC.
 */
01222 static const TParserStateActionItem actionTPS_InMantissa[] = {
01223     {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
01224     {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
01225     {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
01226 };
01227 
/*
 * Action rows for TPS_InXMLEntityFirst (reached from the '&' row of
 * actionTPS_Base).  '#' goes to TPS_InXMLEntityNumFirst; an ASCII letter,
 * ':' or '_' goes to TPS_InXMLEntity; EOF and every other character are
 * A_POP rows.
 */
01228 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
01229     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01230     {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
01231     {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
01232     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
01233     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
01234     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01235 };
01236 
/*
 * Action rows for TPS_InXMLEntity.  Alphanumerics, ':', '_', '.' and '-'
 * keep the state; ';' goes to TPS_InXMLEntityEnd; EOF and every other
 * character are A_POP rows.
 */
01237 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
01238     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01239     {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
01240     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
01241     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
01242     {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
01243     {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
01244     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
01245     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01246 };
01247 
/*
 * Action rows for TPS_InXMLEntityNumFirst (after "&#").  'x' or 'X' goes to
 * TPS_InXMLEntityHexNumFirst; a digit goes to TPS_InXMLEntityNum; EOF and
 * every other character are A_POP rows.
 */
01248 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
01249     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01250     {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
01251     {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
01252     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
01253     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01254 };
01255 
/*
 * Action rows for TPS_InXMLEntityHexNumFirst.  A p_isxdigit character goes
 * to TPS_InXMLEntityHexNum; EOF and every other character are A_POP rows.
 */
01256 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
01257     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01258     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
01259     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01260 };
01261 
/*
 * Action rows for TPS_InXMLEntityNum.  Digits keep the state; ';' goes to
 * TPS_InXMLEntityEnd; EOF and every other character are A_POP rows.
 */
01262 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
01263     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01264     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
01265     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
01266     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01267 };
01268 
/*
 * Action rows for TPS_InXMLEntityHexNum.  p_isxdigit characters keep the
 * state; ';' goes to TPS_InXMLEntityEnd; EOF and every other character are
 * A_POP rows.
 */
01269 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
01270     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01271     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
01272     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
01273     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01274 };
01275 
/*
 * Action rows for TPS_InXMLEntityEnd (reached after the closing ';').  The
 * single NULL-test row is A_BINGO | A_CLEAR with token type XMLENTITY and
 * target TPS_Base.
 */
01276 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
01277     {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
01278 };
01279 
/*
 * Action rows for TPS_InTagFirst (reached from the '<' row of
 * actionTPS_Base).  All matching rows are A_PUSH rows: '/' into
 * TPS_InTagCloseFirst, '!' into TPS_InCommentFirst, '?' into TPS_InXMLBegin,
 * and an ASCII letter, ':' or '_' into TPS_InTagName.  EOF and every other
 * character are A_POP rows.
 */
01280 static const TParserStateActionItem actionTPS_InTagFirst[] = {
01281     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01282     {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
01283     {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
01284     {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
01285     {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
01286     {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
01287     {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
01288     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01289 };
01290 
/*
 * Action rows for TPS_InXMLBegin (after "<?").  Only 'x' is accepted, as an
 * A_NEXT row into TPS_InTag; EOF and every other character are A_POP rows.
 */
01291 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
01292     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01293     /* <?xml ... */
01294     /* XXX do we want states for the m and l ?  Right now this accepts <?xZ */
01295     {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
01296     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01297 };
01298 
/*
 * Action rows for TPS_InTagCloseFirst (after "</").  An ASCII letter goes to
 * TPS_InTagName; EOF and every other character are A_POP rows.
 */
01299 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
01300     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01301     {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
01302     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01303 };
01304 
/*
 * Action rows for TPS_InTagName.  '/' goes to TPS_InTagBeginEnd; '>' goes to
 * TPS_InTagEnd and whitespace goes to TPS_InTag, both with the SpecialTags
 * handler attached.  Alphanumerics, ':', '_', '.' and '-' are A_NEXT rows
 * with a TPS_Null target.  EOF and every other character are A_POP rows.
 */
01305 static const TParserStateActionItem actionTPS_InTagName[] = {
01306     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01307     /* <br/> case */
01308     {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
01309     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
01310     {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
01311     {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
01312     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
01313     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
01314     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
01315     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
01316     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01317 };
01318 
/*
 * Action rows for TPS_InTagBeginEnd (after the '/' of a self-closing tag).
 * '>' goes to TPS_InTagEnd; EOF and every other character are A_POP rows.
 */
01319 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
01320     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01321     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
01322     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01323 };
01324 
/*
 * Action rows for TPS_InTag.  '>' goes to TPS_InTagEnd with the SpecialTags
 * handler; a single quote goes to TPS_InTagEscapeK and a double quote to
 * TPS_InTagEscapeKK.  ASCII letters, digits and the listed punctuation
 * characters ('=', '-', '_', '#', '/', ':', '.', '&', '?', '%', '~') are
 * A_NEXT rows with a TPS_Null target, as is whitespace (with SpecialTags
 * attached).  EOF and every other character are A_POP rows.
 */
01325 static const TParserStateActionItem actionTPS_InTag[] = {
01326     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01327     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
01328     {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
01329     {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
01330     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
01331     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
01332     {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
01333     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
01334     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
01335     {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
01336     {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
01337     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
01338     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
01339     {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
01340     {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
01341     {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
01342     {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
01343     {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
01344     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01345 };
01346 
/*
 * Action rows for TPS_InTagEscapeK (inside a single-quoted section of a
 * tag).  A backslash is an A_PUSH row into TPS_InTagBackSleshed; a single
 * quote goes back to TPS_InTag; any other character keeps the state.  EOF
 * is an A_POP row.
 */
01347 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
01348     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01349     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
01350     {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
01351     {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
01352 };
01353 
/*
 * Action rows for TPS_InTagEscapeKK (inside a double-quoted section of a
 * tag).  A backslash is an A_PUSH row into TPS_InTagBackSleshed; a double
 * quote goes back to TPS_InTag; any other character keeps the state.  EOF
 * is an A_POP row.
 */
01354 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
01355     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01356     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
01357     {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
01358     {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
01359 };
01360 
/*
 * Action rows for TPS_InTagBackSleshed (pushed by the backslash rows of the
 * two quoted-section tables).  EOF is an A_POP row; any other character is
 * an A_MERGE row with a TPS_Null target.
 */
01361 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
01362     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01363     {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
01364 };
01365 
/*
 * Action rows for TPS_InTagEnd (reached after the closing '>').  The single
 * NULL-test row is A_BINGO | A_CLRALL with token type TAG_T and target
 * TPS_Base.
 */
01366 static const TParserStateActionItem actionTPS_InTagEnd[] = {
01367     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
01368 };
01369 
/*
 * Action rows for TPS_InCommentFirst (after "<!").  '-' goes to
 * TPS_InCommentLast; 'D' or 'd' goes to TPS_InTag (the DOCTYPE case noted
 * below); EOF and every other character are A_POP rows.
 */
01370 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
01371     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01372     {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
01373     /* <!DOCTYPE ...> */
01374     {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
01375     {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
01376     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01377 };
01378 
/*
 * Action rows for TPS_InCommentLast (after "<!-").  A second '-' goes to
 * TPS_InComment; EOF and every other character are A_POP rows.
 */
01379 static const TParserStateActionItem actionTPS_InCommentLast[] = {
01380     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01381     {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
01382     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01383 };
01384 
/*
 * Action rows for TPS_InComment (comment body).  '-' goes to
 * TPS_InCloseCommentFirst; any other character is an A_NEXT row with a
 * TPS_Null target; EOF is an A_POP row.
 */
01385 static const TParserStateActionItem actionTPS_InComment[] = {
01386     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01387     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
01388     {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
01389 };
01390 
/*
 * Action rows for TPS_InCloseCommentFirst (one '-' seen inside a comment).
 * A second '-' goes to TPS_InCloseCommentLast; any other character goes back
 * to TPS_InComment; EOF is an A_POP row.
 */
01391 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
01392     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01393     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
01394     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
01395 };
01396 
/*
 * Action rows for TPS_InCloseCommentLast ("--" seen inside a comment).  A
 * further '-' is an A_NEXT row with a TPS_Null target; '>' goes to
 * TPS_InCommentEnd; any other character goes back to TPS_InComment; EOF is
 * an A_POP row.
 */
01397 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
01398     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01399     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
01400     {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
01401     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
01402 };
01403 
/*
 * Action rows for TPS_InCommentEnd.  The single NULL-test row is
 * A_BINGO | A_CLRALL with token type TAG_T and target TPS_Base, i.e. the
 * same row as in actionTPS_InTagEnd.
 */
01404 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
01405     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
01406 };
01407 
/*
 * Action rows for TPS_InHostFirstDomain (pushed by the '.' rows of the word,
 * integer and host tables).  An ASCII letter goes to TPS_InHostDomainSecond;
 * a digit goes to TPS_InHost; EOF and every other character are A_POP rows.
 */
01408 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
01409     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01410     {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
01411     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
01412     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01413 };
01414 
/*
 * Action rows for TPS_InHostDomainSecond.  An ASCII letter goes to
 * TPS_InHostDomain.  A digit (TPS_InHost), '-' and '_' (TPS_InHostFirstAN),
 * '.' (TPS_InHostFirstDomain) and '@' (TPS_InEmail) are A_PUSH rows.  EOF
 * and every other character are A_POP rows; unlike actionTPS_InHostDomain,
 * this table has no A_BINGO row.
 */
01415 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
01416     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01417     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
01418     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
01419     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01420     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01421     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01422     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01423     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01424 };
01425 
/*
 * Action rows for TPS_InHostDomain.  ASCII letters keep the state.  A digit
 * has two rows: an A_PUSH row into TPS_InHost and, further down, an A_POP
 * row.  ':' (TPS_InPortFirst), '-' and '_' (TPS_InHostFirstAN), '.'
 * (TPS_InHostFirstDomain), '@' (TPS_InEmail) and '/' (TPS_InFURL) are A_PUSH
 * rows.  p_isstophost is an A_BINGO | A_CLRALL row with token type HOST and
 * target TPS_InURLPathStart; EOF and the NULL-test row are
 * A_BINGO | A_CLRALL rows with token type HOST and target TPS_Base.
 * NOTE(review): the second p_isdigit row presumably only applies after the
 * pushed TPS_InHost attempt fails -- confirm against the parser loop.
 */
01426 static const TParserStateActionItem actionTPS_InHostDomain[] = {
01427     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
01428     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
01429     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
01430     {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
01431     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01432     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01433     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01434     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01435     {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
01436     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
01437     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
01438     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
01439 };
01440 
/*
 * Action rows for TPS_InPortFirst (pushed by the ':' row of
 * actionTPS_InHostDomain).  A digit goes to TPS_InPort; EOF and every other
 * character are A_POP rows.
 */
01441 static const TParserStateActionItem actionTPS_InPortFirst[] = {
01442     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01443     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
01444     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01445 };
01446 
/*
 * Action rows for TPS_InPort.  Digits keep the state.  p_isstophost is an
 * A_BINGO | A_CLRALL row with token type HOST and target
 * TPS_InURLPathStart; '/' is an A_PUSH row into TPS_InFURL.  EOF and the
 * NULL-test row are A_BINGO | A_CLRALL rows with token type HOST and target
 * TPS_Base.
 */
01447 static const TParserStateActionItem actionTPS_InPort[] = {
01448     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
01449     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
01450     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
01451     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
01452     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
01453 };
01454 
/*
 * Action rows for TPS_InHostFirstAN (pushed by the '-' and '_' rows of the
 * host tables).  A digit or an ASCII letter goes to TPS_InHost; EOF and
 * every other character are A_POP rows.
 */
01455 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
01456     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01457     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
01458     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
01459     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01460 };
01461 
/*
 * Action rows for TPS_InHost.  Digits and ASCII letters keep the state.
 * '@' (TPS_InEmail), '.' (TPS_InHostFirstDomain), '-' and '_'
 * (TPS_InHostFirstAN) are A_PUSH rows.  EOF and every other character are
 * A_POP rows; this table contains no A_BINGO row.
 */
01462 static const TParserStateActionItem actionTPS_InHost[] = {
01463     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01464     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
01465     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
01466     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01467     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01468     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01469     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01470     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01471 };
01472 
/*
 * Action rows for TPS_InEmail (pushed by the '@' rows).  p_isstophost is an
 * A_POP row and is tested before p_ishost, which is an A_BINGO | A_CLRALL
 * row with token type EMAIL and target TPS_Base.  Everything else is an
 * A_POP row.  Unlike most tables here, there is no explicit p_isEOF row.
 */
01473 static const TParserStateActionItem actionTPS_InEmail[] = {
01474     {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
01475     {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
01476     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01477 };
01478 
/*
 * Action rows for TPS_InFileFirst (entered by the '/' rows).  An ASCII
 * letter, digit or '_' goes to TPS_InFile; '.' goes to TPS_InPathFirst; '~'
 * is an A_PUSH row into TPS_InFileTwiddle.  EOF and every other character
 * are A_POP rows.
 */
01479 static const TParserStateActionItem actionTPS_InFileFirst[] = {
01480     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01481     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
01482     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
01483     {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
01484     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
01485     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
01486     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01487 };
01488 
/*
 * Action rows for TPS_InFileTwiddle (after '~').  An ASCII letter, digit or
 * '_' goes to TPS_InFile; '/' goes to TPS_InFileFirst; EOF and every other
 * character are A_POP rows.
 */
01489 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
01490     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01491     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
01492     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
01493     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
01494     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
01495     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01496 };
01497 
/*
 * Action rows for TPS_InPathFirst (a '.' seen in TPS_InFileFirst).  An ASCII
 * letter, digit or '_' goes to TPS_InFile; a second '.' goes to
 * TPS_InPathSecond; '/' goes to TPS_InFileFirst.  EOF and every other
 * character are A_POP rows.
 */
01498 static const TParserStateActionItem actionTPS_InPathFirst[] = {
01499     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01500     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
01501     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
01502     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
01503     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
01504     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
01505     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01506 };
01507 
/*
 * Action rows for TPS_InPathFirstFirst (reached from the '.' row of
 * actionTPS_Base).  A second '.' goes to TPS_InPathSecond; '/' goes to
 * TPS_InFileFirst; EOF and every other character are A_POP rows.
 */
01508 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
01509     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01510     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
01511     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
01512     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01513 };
01514 
/*
 * Action rows for TPS_InPathSecond (after "..").  '/' has two rows: first an
 * A_NEXT | A_PUSH row into TPS_InFileFirst, then an A_BINGO | A_CLEAR row
 * with token type FILEPATH.  EOF and whitespace are also A_BINGO | A_CLEAR
 * rows with token type FILEPATH.  Every other character is an A_POP row.
 * NOTE(review): the second '/' row presumably applies only when the pushed
 * attempt fails -- confirm against the parser loop.
 */
01515 static const TParserStateActionItem actionTPS_InPathSecond[] = {
01516     {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
01517     {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
01518     {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
01519     {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
01520     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01521 };
01522 
/*
 * Action rows for TPS_InFile.  ASCII letters, digits, '_' and '-' keep the
 * state.  '.' is an A_PUSH row into TPS_InFileNext and '/' is an A_PUSH row
 * into TPS_InFileFirst.  EOF and the NULL-test row are A_BINGO rows with
 * token type FILEPATH.
 */
01523 static const TParserStateActionItem actionTPS_InFile[] = {
01524     {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
01525     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
01526     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
01527     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
01528     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
01529     {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
01530     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01531     {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
01532 };
01533 
/*
 * Action rows for TPS_InFileNext (pushed by '.' rows).  An ASCII letter,
 * digit or '_' is an A_CLEAR row into TPS_InFile; EOF and every other
 * character are A_POP rows.
 */
01534 static const TParserStateActionItem actionTPS_InFileNext[] = {
01535     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01536     {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
01537     {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
01538     {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
01539     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01540 };
01541 
/*
 * Rules for state TPS_InURLPathFirst.
 *
 * A character accepted by p_isurlchar advances into TPS_InURLPath; EOF or
 * any other character pops back to the saved position.
 */
01542 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
01543     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01544     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
01545     {NULL, 0, A_POP, TPS_Null, 0, NULL},
01546 };
01547 
/*
 * Rules for state TPS_InURLPathStart: a single unconditional rule that
 * consumes the current character and continues in TPS_InURLPath.
 */
01548 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
01549     {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
01550 };
01551 
/*
 * Rules for state TPS_InURLPath.
 *
 * Characters accepted by p_isurlchar extend the token; EOF or any other
 * character ends it as URLPATH and returns to TPS_Base.
 */
01552 static const TParserStateActionItem actionTPS_InURLPath[] = {
01553     {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
01554     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
01555     {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
01556 };
01557 
/*
 * Rules for state TPS_InFURL.
 *
 * If the p_isURLPath test succeeds, SpecialFURL is invoked, the token is
 * emitted as URL_T, all saved positions are discarded (A_CLRALL) and the
 * parser returns to TPS_Base.  EOF or a failed test pops.
 */
01558 static const TParserStateActionItem actionTPS_InFURL[] = {
01559     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01560     {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
01561     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01562 };
01563 
/*
 * Rules for state TPS_InProtocolFirst: requires '/', which advances into
 * TPS_InProtocolSecond; EOF or anything else pops.
 */
01564 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
01565     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01566     {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
01567     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01568 };
01569 
/*
 * Rules for state TPS_InProtocolSecond: requires another '/', which advances
 * into TPS_InProtocolEnd; EOF or anything else pops.
 */
01570 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
01571     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01572     {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
01573     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01574 };
01575 
/*
 * Rules for state TPS_InProtocolEnd: unconditionally emit the token as
 * PROTOCOL, discard all saved positions and return to TPS_Base.
 */
01576 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
01577     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
01578 };
01579 
/*
 * Rules for state TPS_InHyphenAsciiWordFirst (pushed from
 * TPS_InHyphenAsciiWord on '-').
 *
 * The character after the hyphen selects the continuation: p_isasclet ->
 * TPS_InHyphenAsciiWord, p_isalpha -> TPS_InHyphenWord, p_isdigit ->
 * TPS_InHyphenDigitLookahead.  EOF or anything else pops.
 */
01580 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
01581     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01582     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
01583     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01584     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
01585     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01586 };
01587 
/*
 * Rules for state TPS_InHyphenAsciiWord.
 *
 * p_isasclet characters stay here; p_isalpha or p_isspecial characters
 * switch to TPS_InHyphenWord and p_isdigit to TPS_InHyphenNumWord.  A '-'
 * pushes the position and tries TPS_InHyphenAsciiWordFirst.  EOF and the
 * default rule emit ASCIIHWORD, call SpecialHyphen, discard all saved
 * positions and leave the parser in TPS_InParseHyphen.
 */
01588 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
01589     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
01590     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
01591     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01592     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01593     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01594     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
01595     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
01596 };
01597 
/*
 * Rules for state TPS_InHyphenWordFirst (pushed from TPS_InHyphenWord on
 * '-').
 *
 * p_isalpha continues in TPS_InHyphenWord, p_isdigit in
 * TPS_InHyphenDigitLookahead; EOF or anything else pops.
 */
01598 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
01599     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01600     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01601     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
01602     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01603 };
01604 
/*
 * Rules for state TPS_InHyphenWord.
 *
 * p_isalpha and p_isspecial characters stay here; p_isdigit switches to
 * TPS_InHyphenNumWord.  A '-' pushes the position and tries
 * TPS_InHyphenWordFirst.  EOF and the default rule emit HWORD, call
 * SpecialHyphen, discard all saved positions and leave the parser in
 * TPS_InParseHyphen.
 */
01605 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
01606     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
01607     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01608     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01609     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01610     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
01611     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
01612 };
01613 
/*
 * Rules for state TPS_InHyphenNumWordFirst (pushed from TPS_InHyphenNumWord
 * on '-').
 *
 * p_isalpha continues in TPS_InHyphenNumWord, p_isdigit in
 * TPS_InHyphenDigitLookahead; EOF or anything else pops.
 */
01614 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
01615     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01616     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01617     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
01618     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01619 };
01620 
/*
 * Rules for state TPS_InHyphenNumWord.
 *
 * p_isalnum and p_isspecial characters stay here.  A '-' pushes the position
 * and tries TPS_InHyphenNumWordFirst.  EOF and the default rule emit
 * NUMHWORD, call SpecialHyphen, discard all saved positions and leave the
 * parser in TPS_InParseHyphen.
 */
01621 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
01622     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
01623     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01624     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01625     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
01626     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
01627 };
01628 
/*
 * Rules for state TPS_InHyphenDigitLookahead.
 *
 * Skips over a run of p_isdigit characters; the run is accepted only if it
 * is followed by a p_isalpha or p_isspecial character, which continues in
 * TPS_InHyphenNumWord.  EOF or any other character pops.
 */
01629 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
01630     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01631     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
01632     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01633     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01634     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01635 };
01636 
/*
 * Rules for state TPS_InParseHyphen (the state the hyphenated-word rules
 * leave the parser in after emitting a whole hyphenated word).
 *
 * p_isasclet starts TPS_InHyphenAsciiWordPart and p_isalpha starts
 * TPS_InHyphenWordPart.  A p_isdigit character pushes and tries
 * TPS_InHyphenUnsignedInt; '-' pushes and tries TPS_InParseHyphenHyphen.
 * EOF or anything else re-runs the same character in TPS_Base (A_RERUN).
 */
01637 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
01638     {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
01639     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
01640     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01641     {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
01642     {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
01643     {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
01644 };
01645 
/*
 * Rules for state TPS_InParseHyphenHyphen (pushed from TPS_InParseHyphen on
 * '-').
 *
 * If the next character passes p_isalnum or p_isspecial, a SPACE token is
 * emitted, the saved position is discarded and parsing continues in
 * TPS_InParseHyphen.  EOF or anything else pops.
 */
01646 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
01647     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01648     {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
01649     {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
01650     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01651 };
01652 
/*
 * Rules for state TPS_InHyphenWordPart.
 *
 * p_isalpha and p_isspecial characters stay here; p_isdigit switches to
 * TPS_InHyphenNumWordPart.  EOF emits PARTHWORD and returns to TPS_Base;
 * any other character emits PARTHWORD and returns to TPS_InParseHyphen.
 */
01653 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
01654     {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
01655     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01656     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01657     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
01658     {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
01659 };
01660 
/*
 * Rules for state TPS_InHyphenAsciiWordPart.
 *
 * p_isasclet characters stay here; p_isalpha or p_isspecial switch to
 * TPS_InHyphenWordPart and p_isdigit to TPS_InHyphenNumWordPart.  EOF emits
 * ASCIIPARTHWORD and returns to TPS_Base; any other character emits
 * ASCIIPARTHWORD and returns to TPS_InParseHyphen.
 */
01661 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
01662     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
01663     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
01664     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01665     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01666     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
01667     {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
01668 };
01669 
/*
 * Rules for state TPS_InHyphenNumWordPart.
 *
 * p_isalnum and p_isspecial characters stay here.  EOF emits NUMPARTHWORD
 * and returns to TPS_Base; any other character emits NUMPARTHWORD and
 * returns to TPS_InParseHyphen.
 */
01670 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
01671     {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
01672     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
01673     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
01674     {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
01675 };
01676 
/*
 * Rules for state TPS_InHyphenUnsignedInt (pushed from TPS_InParseHyphen on
 * a p_isdigit character).
 *
 * p_isdigit characters are consumed without changing state (tostate is
 * TPS_Null, which TParserGet treats as "keep the current state").  If the
 * digits are followed by a p_isalpha or p_isspecial character, the saved
 * position is discarded and parsing continues in TPS_InHyphenNumWordPart.
 * EOF or any other character pops.
 */
01677 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
01678     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01679     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
01680     {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
01681     {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
01682     {NULL, 0, A_POP, TPS_Null, 0, NULL}
01683 };
01684 
01685 
01686 /*
01687  * main table of per-state parser actions
01688  */
/*
 * One entry of the Actions[] table: ties a parser state to the rule array
 * that TParserGet scans while the parser is in that state.  The state field
 * is only compared in an Assert; state_name exists only in WPARSER_TRACE
 * builds, where it is printed in the trace output.
 */
01689 typedef struct
01690 {
01691     const TParserStateActionItem *action;       /* the actual state info */
01692     TParserState state;         /* only for Assert crosscheck */
01693 #ifdef WPARSER_TRACE
01694     const char *state_name;     /* only for debug printout */
01695 #endif
01696 } TParserStateAction;
01697 
/*
 * Build one TParserStateAction initializer for the given state.  The rule
 * array is referenced as CppConcat(action,state), i.e. the array for
 * TPS_Foo must be named actionTPS_Foo.  In WPARSER_TRACE builds the state's
 * name is also stored via CppAsString (presumably as a string literal) for
 * trace output.
 */
01698 #ifdef WPARSER_TRACE
01699 #define TPARSERSTATEACTION(state) \
01700     { CppConcat(action,state), state, CppAsString(state) }
01701 #else
01702 #define TPARSERSTATEACTION(state) \
01703     { CppConcat(action,state), state }
01704 #endif
01705 
01706 /*
01707  * order must be the same as in typedef enum {} TParserState!!
01708  */
/*
 * TParserGet indexes this array directly with the current state value
 * (Actions[prs->state->state]) and Asserts that the entry's state field
 * equals the index, so an entry out of order is caught in assert-enabled
 * builds.  TPS_Null has no entry; TParserGet asserts the current state is
 * below it.
 */
01709 
01710 static const TParserStateAction Actions[] = {
01711     TPARSERSTATEACTION(TPS_Base),
01712     TPARSERSTATEACTION(TPS_InNumWord),
01713     TPARSERSTATEACTION(TPS_InAsciiWord),
01714     TPARSERSTATEACTION(TPS_InWord),
01715     TPARSERSTATEACTION(TPS_InUnsignedInt),
01716     TPARSERSTATEACTION(TPS_InSignedIntFirst),
01717     TPARSERSTATEACTION(TPS_InSignedInt),
01718     TPARSERSTATEACTION(TPS_InSpace),
01719     TPARSERSTATEACTION(TPS_InUDecimalFirst),
01720     TPARSERSTATEACTION(TPS_InUDecimal),
01721     TPARSERSTATEACTION(TPS_InDecimalFirst),
01722     TPARSERSTATEACTION(TPS_InDecimal),
01723     TPARSERSTATEACTION(TPS_InVerVersion),
01724     TPARSERSTATEACTION(TPS_InSVerVersion),
01725     TPARSERSTATEACTION(TPS_InVersionFirst),
01726     TPARSERSTATEACTION(TPS_InVersion),
01727     TPARSERSTATEACTION(TPS_InMantissaFirst),
01728     TPARSERSTATEACTION(TPS_InMantissaSign),
01729     TPARSERSTATEACTION(TPS_InMantissa),
01730     TPARSERSTATEACTION(TPS_InXMLEntityFirst),
01731     TPARSERSTATEACTION(TPS_InXMLEntity),
01732     TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
01733     TPARSERSTATEACTION(TPS_InXMLEntityNum),
01734     TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
01735     TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
01736     TPARSERSTATEACTION(TPS_InXMLEntityEnd),
01737     TPARSERSTATEACTION(TPS_InTagFirst),
01738     TPARSERSTATEACTION(TPS_InXMLBegin),
01739     TPARSERSTATEACTION(TPS_InTagCloseFirst),
01740     TPARSERSTATEACTION(TPS_InTagName),
01741     TPARSERSTATEACTION(TPS_InTagBeginEnd),
01742     TPARSERSTATEACTION(TPS_InTag),
01743     TPARSERSTATEACTION(TPS_InTagEscapeK),
01744     TPARSERSTATEACTION(TPS_InTagEscapeKK),
01745     TPARSERSTATEACTION(TPS_InTagBackSleshed),
01746     TPARSERSTATEACTION(TPS_InTagEnd),
01747     TPARSERSTATEACTION(TPS_InCommentFirst),
01748     TPARSERSTATEACTION(TPS_InCommentLast),
01749     TPARSERSTATEACTION(TPS_InComment),
01750     TPARSERSTATEACTION(TPS_InCloseCommentFirst),
01751     TPARSERSTATEACTION(TPS_InCloseCommentLast),
01752     TPARSERSTATEACTION(TPS_InCommentEnd),
01753     TPARSERSTATEACTION(TPS_InHostFirstDomain),
01754     TPARSERSTATEACTION(TPS_InHostDomainSecond),
01755     TPARSERSTATEACTION(TPS_InHostDomain),
01756     TPARSERSTATEACTION(TPS_InPortFirst),
01757     TPARSERSTATEACTION(TPS_InPort),
01758     TPARSERSTATEACTION(TPS_InHostFirstAN),
01759     TPARSERSTATEACTION(TPS_InHost),
01760     TPARSERSTATEACTION(TPS_InEmail),
01761     TPARSERSTATEACTION(TPS_InFileFirst),
01762     TPARSERSTATEACTION(TPS_InFileTwiddle),
01763     TPARSERSTATEACTION(TPS_InPathFirst),
01764     TPARSERSTATEACTION(TPS_InPathFirstFirst),
01765     TPARSERSTATEACTION(TPS_InPathSecond),
01766     TPARSERSTATEACTION(TPS_InFile),
01767     TPARSERSTATEACTION(TPS_InFileNext),
01768     TPARSERSTATEACTION(TPS_InURLPathFirst),
01769     TPARSERSTATEACTION(TPS_InURLPathStart),
01770     TPARSERSTATEACTION(TPS_InURLPath),
01771     TPARSERSTATEACTION(TPS_InFURL),
01772     TPARSERSTATEACTION(TPS_InProtocolFirst),
01773     TPARSERSTATEACTION(TPS_InProtocolSecond),
01774     TPARSERSTATEACTION(TPS_InProtocolEnd),
01775     TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
01776     TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
01777     TPARSERSTATEACTION(TPS_InHyphenWordFirst),
01778     TPARSERSTATEACTION(TPS_InHyphenWord),
01779     TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
01780     TPARSERSTATEACTION(TPS_InHyphenNumWord),
01781     TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
01782     TPARSERSTATEACTION(TPS_InParseHyphen),
01783     TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
01784     TPARSERSTATEACTION(TPS_InHyphenWordPart),
01785     TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
01786     TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
01787     TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
01788 };
01789 
01790 
/*
 * TParserGet
 *      Drive the state machine from the current position until one token
 *      has been recognized.
 *
 * Returns false immediately if the current byte position is at or past the
 * end of the input, and also if the scan loop ends on a rule that does not
 * carry A_BINGO.  Returns true when an A_BINGO rule fired; in that case
 * prs->token points at the token start and prs->lenbytetoken,
 * prs->lenchartoken and prs->type have been filled in.
 *
 * Each iteration selects the first rule of the current state's table whose
 * isclass test succeeds (a rule with isclass == NULL ends the table and
 * acts as the default), calls its special handler if any, and then applies
 * the rule's flags:
 *   A_BINGO   record the token length and type, then leave the loop
 *   A_POP     free the current position and return to the saved one
 *   A_PUSH    remember the rule and continue in a new position obtained
 *             from newTParserPosition()
 *   A_CLEAR   free the most recently saved position
 *   A_CLRALL  free every saved position
 *   A_MERGE   copy the position counters into the saved position and
 *             continue there
 *   A_RERUN   apply the new state to the same character
 * Rules without A_BINGO, A_POP or A_RERUN then advance by one character.
 */
01791 static bool
01792 TParserGet(TParser *prs)
01793 {
01794     const TParserStateActionItem *item = NULL;
01795 
01796     Assert(prs->state);
01797 
    /* nothing left to scan */
01798     if (prs->state->posbyte >= prs->lenstr)
01799         return false;
01800 
    /* a token, if one is found, starts at the current byte position */
01801     prs->token = prs->str + prs->state->posbyte;
01802     prs->state->pushedAtAction = NULL;
01803 
01804     /* look at string */
01805     while (prs->state->posbyte <= prs->lenstr)
01806     {
        /*
         * charlen is 0 at end of input; otherwise it is 1 when charmaxlen
         * is 1, or whatever pg_mblen() reports for the current character.
         */
01807         if (prs->state->posbyte == prs->lenstr)
01808             prs->state->charlen = 0;
01809         else
01810             prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
01811                 pg_mblen(prs->str + prs->state->posbyte);
01812 
01813         Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
01814         Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
01815         Assert(Actions[prs->state->state].state == prs->state->state);
01816 
01817         if (prs->state->pushedAtAction)
01818         {
01819             /* After a POP, pick up at the next test */
01820             item = prs->state->pushedAtAction + 1;
01821             prs->state->pushedAtAction = NULL;
01822         }
01823         else
01824         {
01825             item = Actions[prs->state->state].action;
01826             Assert(item != NULL);
01827         }
01828 
01829         /* find action by character class */
        /* (prs->c carries the rule's comparison character to the test) */
01830         while (item->isclass)
01831         {
01832             prs->c = item->c;
01833             if (item->isclass(prs) != 0)
01834                 break;
01835             item++;
01836         }
01837 
01838 #ifdef WPARSER_TRACE
01839         {
01840             TParserPosition *ptr;
01841 
01842             fprintf(stderr, "state ");
01843             /* indent according to stack depth */
01844             for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
01845                 fprintf(stderr, "  ");
01846             fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
01847             if (prs->state->posbyte < prs->lenstr)
01848                 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
01849             else
01850                 fprintf(stderr, "at EOF");
01851             fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
01852                     (int) (item - Actions[prs->state->state].action),
01853                     (item->flags & A_BINGO) ? " BINGO" : "",
01854                     (item->flags & A_POP) ? " POP" : "",
01855                     (item->flags & A_PUSH) ? " PUSH" : "",
01856                     (item->flags & A_RERUN) ? " RERUN" : "",
01857                     (item->flags & A_CLEAR) ? " CLEAR" : "",
01858                     (item->flags & A_MERGE) ? " MERGE" : "",
01859                     (item->flags & A_CLRALL) ? " CLRALL" : "",
01860                     (item->tostate != TPS_Null) ? " tostate " : "",
01861                     (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
01862                     (item->type > 0) ? " type " : "",
01863                     tok_alias[item->type]);
01864         }
01865 #endif
01866 
01867         /* call special handler if exists */
01868         if (item->special)
01869             item->special(prs);
01870 
01871         /* BINGO, token is found */
01872         if (item->flags & A_BINGO)
01873         {
01874             Assert(item->type > 0);
01875             prs->lenbytetoken = prs->state->lenbytetoken;
01876             prs->lenchartoken = prs->state->lenchartoken;
01877             prs->state->lenbytetoken = prs->state->lenchartoken = 0;
01878             prs->type = item->type;
01879         }
01880 
01881         /* do various actions by flags */
        /* (the stack flags are tested as an else-if chain: only one applies) */
01882         if (item->flags & A_POP)
01883         {                       /* pop stored state in stack */
01884             TParserPosition *ptr = prs->state->prev;
01885 
01886             pfree(prs->state);
01887             prs->state = ptr;
01888             Assert(prs->state);
01889         }
01890         else if (item->flags & A_PUSH)
01891         {                       /* push (store) state in stack */
01892             prs->state->pushedAtAction = item;  /* remember where we push */
01893             prs->state = newTParserPosition(prs->state);
01894         }
01895         else if (item->flags & A_CLEAR)
01896         {                       /* clear previous pushed state */
01897             TParserPosition *ptr;
01898 
01899             Assert(prs->state->prev);
01900             ptr = prs->state->prev->prev;
01901             pfree(prs->state->prev);
01902             prs->state->prev = ptr;
01903         }
01904         else if (item->flags & A_CLRALL)
01905         {                       /* clear all previous pushed state */
01906             TParserPosition *ptr;
01907 
01908             while (prs->state->prev)
01909             {
01910                 ptr = prs->state->prev->prev;
01911                 pfree(prs->state->prev);
01912                 prs->state->prev = ptr;
01913             }
01914         }
01915         else if (item->flags & A_MERGE)
01916         {                       /* merge posinfo with current and pushed state */
01917             TParserPosition *ptr = prs->state;
01918 
01919             Assert(prs->state->prev);
01920             prs->state = prs->state->prev;
01921 
01922             prs->state->posbyte = ptr->posbyte;
01923             prs->state->poschar = ptr->poschar;
01924             prs->state->charlen = ptr->charlen;
01925             prs->state->lenbytetoken = ptr->lenbytetoken;
01926             prs->state->lenchartoken = ptr->lenchartoken;
01927             pfree(ptr);
01928         }
01929 
01930         /* set new state if pointed */
        /* (this applies to whichever position is current after the above) */
01931         if (item->tostate != TPS_Null)
01932             prs->state->state = item->tostate;
01933 
01934         /* check for go away */
        /* stop on a token, or at end of input unless the rule asks to rerun */
01935         if ((item->flags & A_BINGO) ||
01936             (prs->state->posbyte >= prs->lenstr &&
01937              (item->flags & A_RERUN) == 0))
01938             break;
01939 
01940         /* go to beginning of loop if we should rerun or we just restore state */
01941         if (item->flags & (A_RERUN | A_POP))
01942             continue;
01943 
01944         /* move forward */
        /* (charlen is 0 only at end of input, where there is nothing to skip) */
01945         if (prs->state->charlen)
01946         {
01947             prs->state->posbyte += prs->state->charlen;
01948             prs->state->lenbytetoken += prs->state->charlen;
01949             prs->state->poschar++;
01950             prs->state->lenchartoken++;
01951         }
01952     }
01953 
01954     return (item && (item->flags & A_BINGO)) ? true : false;
01955 }
01956 
01957 Datum
01958 prsd_lextype(PG_FUNCTION_ARGS)
01959 {
01960     LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
01961     int         i;
01962 
01963     for (i = 1; i <= LASTNUM; i++)
01964     {
01965         descr[i - 1].lexid = i;
01966         descr[i - 1].alias = pstrdup(tok_alias[i]);
01967         descr[i - 1].descr = pstrdup(lex_descr[i]);
01968     }
01969 
01970     descr[LASTNUM].lexid = 0;
01971 
01972     PG_RETURN_POINTER(descr);
01973 }
01974 
01975 Datum
01976 prsd_start(PG_FUNCTION_ARGS)
01977 {
01978     PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
01979 }
01980 
01981 Datum
01982 prsd_nexttoken(PG_FUNCTION_ARGS)
01983 {
01984     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
01985     char      **t = (char **) PG_GETARG_POINTER(1);
01986     int        *tlen = (int *) PG_GETARG_POINTER(2);
01987 
01988     if (!TParserGet(p))
01989         PG_RETURN_INT32(0);
01990 
01991     *t = p->token;
01992     *tlen = p->lenbytetoken;
01993 
01994     PG_RETURN_INT32(p->type);
01995 }
01996 
01997 Datum
01998 prsd_end(PG_FUNCTION_ARGS)
01999 {
02000     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
02001 
02002     TParserClose(p);
02003     PG_RETURN_VOID();
02004 }
02005 
/*
 * Token-type classification macros used by the headline code.  Each takes a
 * token type number (one of the "Output token categories") and yields a
 * boolean.  Note that several of them are defined identically.
 */
/* LEAVETOKEN / ENDPUNCTOKEN: true only for SPACE tokens */
/* COMPLEXTOKEN: URLs and whole hyphenated words */
02006 #define LEAVETOKEN(x)   ( (x)==SPACE )
02007 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
02008 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
02009 
/* TS_IDIGNORE: tags, protocols, spaces and XML entities */
/* HLIDREPLACE: tokens that mark_fragment flags with "replace" (tags) */
/* HLIDSKIP / XMLHLIDSKIP: tokens that mark_fragment flags with "skip" */
/* NONWORDTOKEN: tokens not counted as words when measuring fragments */
/* NOENDTOKEN: tokens a stretched fragment is trimmed back past at its ends */
02010 #define TS_IDIGNORE(x)  ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
02011 #define HLIDREPLACE(x)  ( (x)==TAG_T )
02012 #define HLIDSKIP(x)     ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
02013 #define XMLHLIDSKIP(x)  ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
02014 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
02015 #define NOENDTOKEN(x)   ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
02016 
/*
 * Argument passed to checkcondition_HL via TS_execute: a window of "len"
 * consecutive headline words starting at "words".  hlCover points it at the
 * candidate cover prs->words[p..q].
 */
02017 typedef struct
02018 {
02019     HeadlineWordEntry *words;
02020     int         len;
02021 } hlCheck;
02022 
02023 static bool
02024 checkcondition_HL(void *checkval, QueryOperand *val)
02025 {
02026     int         i;
02027 
02028     for (i = 0; i < ((hlCheck *) checkval)->len; i++)
02029     {
02030         if (((hlCheck *) checkval)->words[i].item == val)
02031             return true;
02032     }
02033     return false;
02034 }
02035 
02036 
02037 static bool
02038 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
02039 {
02040     int         i,
02041                 j;
02042     QueryItem  *item = GETQUERY(query);
02043     int         pos = *p;
02044 
02045     *q = -1;
02046     *p = 0x7fffffff;
02047 
02048     for (j = 0; j < query->size; j++)
02049     {
02050         if (item->type != QI_VAL)
02051         {
02052             item++;
02053             continue;
02054         }
02055         for (i = pos; i < prs->curwords; i++)
02056         {
02057             if (prs->words[i].item == &item->qoperand)
02058             {
02059                 if (i > *q)
02060                     *q = i;
02061                 break;
02062             }
02063         }
02064         item++;
02065     }
02066 
02067     if (*q < 0)
02068         return false;
02069 
02070     item = GETQUERY(query);
02071     for (j = 0; j < query->size; j++)
02072     {
02073         if (item->type != QI_VAL)
02074         {
02075             item++;
02076             continue;
02077         }
02078         for (i = *q; i >= pos; i--)
02079         {
02080             if (prs->words[i].item == &item->qoperand)
02081             {
02082                 if (i < *p)
02083                     *p = i;
02084                 break;
02085             }
02086         }
02087         item++;
02088     }
02089 
02090     if (*p <= *q)
02091     {
02092         hlCheck     ch;
02093 
02094         ch.words = &(prs->words[*p]);
02095         ch.len = *q - *p + 1;
02096         if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
02097             return true;
02098         else
02099         {
02100             (*p)++;
02101             return hlCover(prs, query, p, q);
02102         }
02103     }
02104 
02105     return false;
02106 }
02107 
02108 static void
02109 mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
02110 {
02111     int         i;
02112 
02113     for (i = startpos; i <= endpos; i++)
02114     {
02115         if (prs->words[i].item)
02116             prs->words[i].selected = 1;
02117         if (highlight == 0)
02118         {
02119             if (HLIDREPLACE(prs->words[i].type))
02120                 prs->words[i].replace = 1;
02121             else if (HLIDSKIP(prs->words[i].type))
02122                 prs->words[i].skip = 1;
02123         }
02124         else
02125         {
02126             if (XMLHLIDSKIP(prs->words[i].type))
02127                 prs->words[i].skip = 1;
02128         }
02129 
02130         prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
02131     }
02132 }
02133 
/*
 * One candidate headline fragment collected by mark_hl_fragments.
 *
 * startpos/endpos  first and last index into prs->words (inclusive)
 * poslen           number of non-repeated query-item words in the fragment
 * curlen           number of words in the fragment, not counting
 *                  NONWORDTOKEN tokens
 * in               set to 1 once the fragment has been chosen and marked
 * excluded         set to 1 when the fragment overlaps a chosen one
 */
02134 typedef struct
02135 {
02136     int32       startpos;
02137     int32       endpos;
02138     int32       poslen;
02139     int32       curlen;
02140     int16       in;
02141     int16       excluded;
02142 } CoverPos;
02143 
02144 static void
02145 get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
02146                   int *curlen, int *poslen, int max_words)
02147 {
02148     int         i;
02149 
02150     /*
02151      * Objective: Generate a fragment of words between startpos and endpos
02152      * such that it has at most max_words and both ends has query words. If
02153      * the startpos and endpos are the endpoints of the cover and the cover
02154      * has fewer words than max_words, then this function should just return
02155      * the cover
02156      */
02157     /* first move startpos to an item */
02158     for (i = *startpos; i <= *endpos; i++)
02159     {
02160         *startpos = i;
02161         if (prs->words[i].item && !prs->words[i].repeated)
02162             break;
02163     }
02164     /* cut endpos to have only max_words */
02165     *curlen = 0;
02166     *poslen = 0;
02167     for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
02168     {
02169         if (!NONWORDTOKEN(prs->words[i].type))
02170             *curlen += 1;
02171         if (prs->words[i].item && !prs->words[i].repeated)
02172             *poslen += 1;
02173     }
02174     /* if the cover was cut then move back endpos to a query item */
02175     if (*endpos > i)
02176     {
02177         *endpos = i;
02178         for (i = *endpos; i >= *startpos; i--)
02179         {
02180             *endpos = i;
02181             if (prs->words[i].item && !prs->words[i].repeated)
02182                 break;
02183             if (!NONWORDTOKEN(prs->words[i].type))
02184                 *curlen -= 1;
02185         }
02186     }
02187 }
02188 
02189 static void
02190 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
02191                   int shortword, int min_words,
02192                   int max_words, int max_fragments)
02193 {
02194     int32       poslen,
02195                 curlen,
02196                 i,
02197                 f,
02198                 num_f = 0;
02199     int32       stretch,
02200                 maxstretch,
02201                 posmarker;
02202 
02203     int32       startpos = 0,
02204                 endpos = 0,
02205                 p = 0,
02206                 q = 0;
02207 
02208     int32       numcovers = 0,
02209                 maxcovers = 32;
02210 
02211     int32       minI,
02212                 minwords,
02213                 maxitems;
02214     CoverPos   *covers;
02215 
02216     covers = palloc(maxcovers * sizeof(CoverPos));
02217 
02218     /* get all covers */
02219     while (hlCover(prs, query, &p, &q))
02220     {
02221         startpos = p;
02222         endpos = q;
02223 
02224         /*
02225          * Break the cover into smaller fragments such that each fragment has
02226          * at most max_words. Also ensure that each end of the fragment is a
02227          * query word. This will allow us to stretch the fragment in either
02228          * direction
02229          */
02230 
02231         while (startpos <= endpos)
02232         {
02233             get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
02234             if (numcovers >= maxcovers)
02235             {
02236                 maxcovers *= 2;
02237                 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
02238             }
02239             covers[numcovers].startpos = startpos;
02240             covers[numcovers].endpos = endpos;
02241             covers[numcovers].curlen = curlen;
02242             covers[numcovers].poslen = poslen;
02243             covers[numcovers].in = 0;
02244             covers[numcovers].excluded = 0;
02245             numcovers++;
02246             startpos = endpos + 1;
02247             endpos = q;
02248         }
02249         /* move p to generate the next cover */
02250         p++;
02251     }
02252 
02253     /* choose best covers */
02254     for (f = 0; f < max_fragments; f++)
02255     {
02256         maxitems = 0;
02257         minwords = 0x7fffffff;
02258         minI = -1;
02259 
02260         /*
02261          * Choose the cover that contains max items. In case of tie choose the
02262          * one with smaller number of words.
02263          */
02264         for (i = 0; i < numcovers; i++)
02265         {
02266             if (!covers[i].in && !covers[i].excluded &&
02267                 (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
02268                                             && minwords > covers[i].curlen)))
02269             {
02270                 maxitems = covers[i].poslen;
02271                 minwords = covers[i].curlen;
02272                 minI = i;
02273             }
02274         }
02275         /* if a cover was found mark it */
02276         if (minI >= 0)
02277         {
02278             covers[minI].in = 1;
02279             /* adjust the size of cover */
02280             startpos = covers[minI].startpos;
02281             endpos = covers[minI].endpos;
02282             curlen = covers[minI].curlen;
02283             /* stretch the cover if cover size is lower than max_words */
02284             if (curlen < max_words)
02285             {
02286                 /* divide the stretch on both sides of cover */
02287                 maxstretch = (max_words - curlen) / 2;
02288 
02289                 /*
02290                  * first stretch the startpos stop stretching if 1. we hit the
02291                  * beginning of document 2. exceed maxstretch 3. we hit an
02292                  * already marked fragment
02293                  */
02294                 stretch = 0;
02295                 posmarker = startpos;
02296                 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
02297                 {
02298                     if (!NONWORDTOKEN(prs->words[i].type))
02299                     {
02300                         curlen++;
02301                         stretch++;
02302                     }
02303                     posmarker = i;
02304                 }
02305                 /* cut back startpos till we find a non short token */
02306                 for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
02307                 {
02308                     if (!NONWORDTOKEN(prs->words[i].type))
02309                         curlen--;
02310                 }
02311                 startpos = i;
02312                 /* now stretch the endpos as much as possible */
02313                 posmarker = endpos;
02314                 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
02315                 {
02316                     if (!NONWORDTOKEN(prs->words[i].type))
02317                         curlen++;
02318                     posmarker = i;
02319                 }
02320                 /* cut back endpos till we find a non-short token */
02321                 for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
02322                 {
02323                     if (!NONWORDTOKEN(prs->words[i].type))
02324                         curlen--;
02325                 }
02326                 endpos = i;
02327             }
02328             covers[minI].startpos = startpos;
02329             covers[minI].endpos = endpos;
02330             covers[minI].curlen = curlen;
02331             /* Mark the chosen fragments (covers) */
02332             mark_fragment(prs, highlight, startpos, endpos);
02333             num_f++;
02334             /* exclude overlapping covers */
02335             for (i = 0; i < numcovers; i++)
02336             {
02337                 if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
02338                     covers[i].excluded = 1;
02339             }
02340         }
02341         else
02342             break;
02343     }
02344 
02345     /* show at least min_words we have not marked anything */
02346     if (num_f <= 0)
02347     {
02348         startpos = endpos = curlen = 0;
02349         for (i = 0; i < prs->curwords && curlen < min_words; i++)
02350         {
02351             if (!NONWORDTOKEN(prs->words[i].type))
02352                 curlen++;
02353             endpos = i;
02354         }
02355         mark_fragment(prs, highlight, startpos, endpos);
02356     }
02357     pfree(covers);
02358 }
02359 
02360 static void
02361 mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
02362               int shortword, int min_words, int max_words)
02363 {
02364     int         p = 0,
02365                 q = 0;
02366     int         bestb = -1,
02367                 beste = -1;
02368     int         bestlen = -1;
02369     int         pose = 0,
02370                 posb,
02371                 poslen,
02372                 curlen;
02373 
02374     int         i;
02375 
02376     if (highlight == 0)
02377     {
02378         while (hlCover(prs, query, &p, &q))
02379         {
02380             /* find cover len in words */
02381             curlen = 0;
02382             poslen = 0;
02383             for (i = p; i <= q && curlen < max_words; i++)
02384             {
02385                 if (!NONWORDTOKEN(prs->words[i].type))
02386                     curlen++;
02387                 if (prs->words[i].item && !prs->words[i].repeated)
02388                     poslen++;
02389                 pose = i;
02390             }
02391 
02392             if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
02393             {
02394                 /* best already finded, so try one more cover */
02395                 p++;
02396                 continue;
02397             }
02398 
02399             posb = p;
02400             if (curlen < max_words)
02401             {                   /* find good end */
02402                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
02403                 {
02404                     if (i != q)
02405                     {
02406                         if (!NONWORDTOKEN(prs->words[i].type))
02407                             curlen++;
02408                         if (prs->words[i].item && !prs->words[i].repeated)
02409                             poslen++;
02410                     }
02411                     pose = i;
02412                     if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
02413                         continue;
02414                     if (curlen >= min_words)
02415                         break;
02416                 }
02417                 if (curlen < min_words && i >= prs->curwords)
02418                 {               /* got end of text and our cover is shoter
02419                                  * than min_words */
02420                     for (i = p - 1; i >= 0; i--)
02421                     {
02422                         if (!NONWORDTOKEN(prs->words[i].type))
02423                             curlen++;
02424                         if (prs->words[i].item && !prs->words[i].repeated)
02425                             poslen++;
02426                         if (curlen >= max_words)
02427                             break;
02428                         if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
02429                             continue;
02430                         if (curlen >= min_words)
02431                             break;
02432                     }
02433                     posb = (i >= 0) ? i : 0;
02434                 }
02435             }
02436             else
02437             {                   /* shorter cover :((( */
02438                 for (; curlen > min_words; i--)
02439                 {
02440                     if (!NONWORDTOKEN(prs->words[i].type))
02441                         curlen--;
02442                     if (prs->words[i].item && !prs->words[i].repeated)
02443                         poslen--;
02444                     pose = i;
02445                     if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
02446                         continue;
02447                     break;
02448                 }
02449             }
02450 
02451             if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
02452                 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
02453                  (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
02454             {
02455                 bestb = posb;
02456                 beste = pose;
02457                 bestlen = poslen;
02458             }
02459 
02460             p++;
02461         }
02462 
02463         if (bestlen < 0)
02464         {
02465             curlen = 0;
02466             for (i = 0; i < prs->curwords && curlen < min_words; i++)
02467             {
02468                 if (!NONWORDTOKEN(prs->words[i].type))
02469                     curlen++;
02470                 pose = i;
02471             }
02472             bestb = 0;
02473             beste = pose;
02474         }
02475     }
02476     else
02477     {
02478         bestb = 0;
02479         beste = prs->curwords - 1;
02480     }
02481 
02482     for (i = bestb; i <= beste; i++)
02483     {
02484         if (prs->words[i].item)
02485             prs->words[i].selected = 1;
02486         if (highlight == 0)
02487         {
02488             if (HLIDREPLACE(prs->words[i].type))
02489                 prs->words[i].replace = 1;
02490             else if (HLIDSKIP(prs->words[i].type))
02491                 prs->words[i].skip = 1;
02492         }
02493         else
02494         {
02495             if (XMLHLIDSKIP(prs->words[i].type))
02496                 prs->words[i].skip = 1;
02497         }
02498 
02499         prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
02500     }
02501 
02502 }
02503 
02504 Datum
02505 prsd_headline(PG_FUNCTION_ARGS)
02506 {
02507     HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
02508     List       *prsoptions = (List *) PG_GETARG_POINTER(1);
02509     TSQuery     query = PG_GETARG_TSQUERY(2);
02510 
02511     /* from opt + start and end tag */
02512     int         min_words = 15;
02513     int         max_words = 35;
02514     int         shortword = 3;
02515     int         max_fragments = 0;
02516     int         highlight = 0;
02517     ListCell   *l;
02518 
02519     /* config */
02520     prs->startsel = NULL;
02521     prs->stopsel = NULL;
02522     foreach(l, prsoptions)
02523     {
02524         DefElem    *defel = (DefElem *) lfirst(l);
02525         char       *val = defGetString(defel);
02526 
02527         if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
02528             max_words = pg_atoi(val, sizeof(int32), 0);
02529         else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
02530             min_words = pg_atoi(val, sizeof(int32), 0);
02531         else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
02532             shortword = pg_atoi(val, sizeof(int32), 0);
02533         else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
02534             max_fragments = pg_atoi(val, sizeof(int32), 0);
02535         else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
02536             prs->startsel = pstrdup(val);
02537         else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
02538             prs->stopsel = pstrdup(val);
02539         else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
02540             prs->fragdelim = pstrdup(val);
02541         else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
02542             highlight = (pg_strcasecmp(val, "1") == 0 ||
02543                          pg_strcasecmp(val, "on") == 0 ||
02544                          pg_strcasecmp(val, "true") == 0 ||
02545                          pg_strcasecmp(val, "t") == 0 ||
02546                          pg_strcasecmp(val, "y") == 0 ||
02547                          pg_strcasecmp(val, "yes") == 0);
02548         else
02549             ereport(ERROR,
02550                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02551                      errmsg("unrecognized headline parameter: \"%s\"",
02552                             defel->defname)));
02553     }
02554 
02555     if (highlight == 0)
02556     {
02557         if (min_words >= max_words)
02558             ereport(ERROR,
02559                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02560                      errmsg("MinWords should be less than MaxWords")));
02561         if (min_words <= 0)
02562             ereport(ERROR,
02563                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02564                      errmsg("MinWords should be positive")));
02565         if (shortword < 0)
02566             ereport(ERROR,
02567                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02568                      errmsg("ShortWord should be >= 0")));
02569         if (max_fragments < 0)
02570             ereport(ERROR,
02571                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02572                      errmsg("MaxFragments should be >= 0")));
02573     }
02574 
02575     if (max_fragments == 0)
02576         /* call the default headline generator */
02577         mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
02578     else
02579         mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
02580 
02581     if (!prs->startsel)
02582         prs->startsel = pstrdup("<b>");
02583     if (!prs->stopsel)
02584         prs->stopsel = pstrdup("</b>");
02585     if (!prs->fragdelim)
02586         prs->fragdelim = pstrdup(" ... ");
02587     prs->startsellen = strlen(prs->startsel);
02588     prs->stopsellen = strlen(prs->stopsel);
02589     prs->fragdelimlen = strlen(prs->fragdelim);
02590 
02591     PG_RETURN_POINTER(prs);
02592 }