00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "postgres.h"
00016
00017 #include "catalog/pg_collation.h"
00018 #include "commands/defrem.h"
00019 #include "tsearch/ts_locale.h"
00020 #include "tsearch/ts_public.h"
00021 #include "tsearch/ts_type.h"
00022 #include "tsearch/ts_utils.h"
00023 #include "utils/builtins.h"
00024
00025
00026
00027
00028
00029
00030
00031
/*
 * Token type codes (1..LASTNUM).  The numeric values line up with the
 * positions of the entries in tok_alias[] and lex_descr[], whose element 0
 * is an empty placeholder.
 */
00032 #define ASCIIWORD 1
00033 #define WORD_T 2
00034 #define NUMWORD 3
00035 #define EMAIL 4
00036 #define URL_T 5
00037 #define HOST 6
00038 #define SCIENTIFIC 7
00039 #define VERSIONNUMBER 8
00040 #define NUMPARTHWORD 9
00041 #define PARTHWORD 10
00042 #define ASCIIPARTHWORD 11
00043 #define SPACE 12
00044 #define TAG_T 13
00045 #define PROTOCOL 14
00046 #define NUMHWORD 15
00047 #define ASCIIHWORD 16
00048 #define HWORD 17
00049 #define URLPATH 18
00050 #define FILEPATH 19
00051 #define DECIMAL_T 20
00052 #define SIGNEDINT 21
00053 #define UNSIGNEDINT 22
00054 #define XMLENTITY 23
00055
/* Largest token type code defined above. */
00056 #define LASTNUM 23
00057
/*
 * Short alias string for each token type, positioned so that element N
 * corresponds to the token type code N (element 0 is an empty placeholder).
 */
00058 static const char *const tok_alias[] = {
00059 "",
00060 "asciiword",
00061 "word",
00062 "numword",
00063 "email",
00064 "url",
00065 "host",
00066 "sfloat",
00067 "version",
00068 "hword_numpart",
00069 "hword_part",
00070 "hword_asciipart",
00071 "blank",
00072 "tag",
00073 "protocol",
00074 "numhword",
00075 "asciihword",
00076 "hword",
00077 "url_path",
00078 "file",
00079 "float",
00080 "int",
00081 "uint",
00082 "entity"
00083 };
00084
/*
 * Human-readable description for each token type, positioned so that
 * element N corresponds to the token type code N (element 0 is an empty
 * placeholder).
 */
00085 static const char *const lex_descr[] = {
00086 "",
00087 "Word, all ASCII",
00088 "Word, all letters",
00089 "Word, letters and digits",
00090 "Email address",
00091 "URL",
00092 "Host",
00093 "Scientific notation",
00094 "Version number",
00095 "Hyphenated word part, letters and digits",
00096 "Hyphenated word part, all letters",
00097 "Hyphenated word part, all ASCII",
00098 "Space symbols",
00099 "XML tag",
00100 "Protocol head",
00101 "Hyphenated word, letters and digits",
00102 "Hyphenated word, all ASCII",
00103 "Hyphenated word, all letters",
00104 "URL path",
00105 "File or path name",
00106 "Decimal notation",
00107 "Signed integer",
00108 "Unsigned integer",
00109 "XML entity"
00110 };
00111
00112
00113
00114
/*
 * States of the parser's state machine.  TPS_Base (0) is the state a new
 * parser starts in (see TParserInit).  Each state has a matching
 * actionTPS_<name> transition table.
 *
 * NOTE(review): TPS_Null appears as a target state in many table rows; its
 * exact meaning is decided by the table driver, which is not visible in
 * this part of the file.
 */
00115 typedef enum
00116 {
00117 TPS_Base = 0,
00118 TPS_InNumWord,
00119 TPS_InAsciiWord,
00120 TPS_InWord,
00121 TPS_InUnsignedInt,
00122 TPS_InSignedIntFirst,
00123 TPS_InSignedInt,
00124 TPS_InSpace,
00125 TPS_InUDecimalFirst,
00126 TPS_InUDecimal,
00127 TPS_InDecimalFirst,
00128 TPS_InDecimal,
00129 TPS_InVerVersion,
00130 TPS_InSVerVersion,
00131 TPS_InVersionFirst,
00132 TPS_InVersion,
00133 TPS_InMantissaFirst,
00134 TPS_InMantissaSign,
00135 TPS_InMantissa,
00136 TPS_InXMLEntityFirst,
00137 TPS_InXMLEntity,
00138 TPS_InXMLEntityNumFirst,
00139 TPS_InXMLEntityNum,
00140 TPS_InXMLEntityHexNumFirst,
00141 TPS_InXMLEntityHexNum,
00142 TPS_InXMLEntityEnd,
00143 TPS_InTagFirst,
00144 TPS_InXMLBegin,
00145 TPS_InTagCloseFirst,
00146 TPS_InTagName,
00147 TPS_InTagBeginEnd,
00148 TPS_InTag,
00149 TPS_InTagEscapeK,
00150 TPS_InTagEscapeKK,
00151 TPS_InTagBackSleshed,
00152 TPS_InTagEnd,
00153 TPS_InCommentFirst,
00154 TPS_InCommentLast,
00155 TPS_InComment,
00156 TPS_InCloseCommentFirst,
00157 TPS_InCloseCommentLast,
00158 TPS_InCommentEnd,
00159 TPS_InHostFirstDomain,
00160 TPS_InHostDomainSecond,
00161 TPS_InHostDomain,
00162 TPS_InPortFirst,
00163 TPS_InPort,
00164 TPS_InHostFirstAN,
00165 TPS_InHost,
00166 TPS_InEmail,
00167 TPS_InFileFirst,
00168 TPS_InFileTwiddle,
00169 TPS_InPathFirst,
00170 TPS_InPathFirstFirst,
00171 TPS_InPathSecond,
00172 TPS_InFile,
00173 TPS_InFileNext,
00174 TPS_InURLPathFirst,
00175 TPS_InURLPathStart,
00176 TPS_InURLPath,
00177 TPS_InFURL,
00178 TPS_InProtocolFirst,
00179 TPS_InProtocolSecond,
00180 TPS_InProtocolEnd,
00181 TPS_InHyphenAsciiWordFirst,
00182 TPS_InHyphenAsciiWord,
00183 TPS_InHyphenWordFirst,
00184 TPS_InHyphenWord,
00185 TPS_InHyphenNumWordFirst,
00186 TPS_InHyphenNumWord,
00187 TPS_InHyphenDigitLookahead,
00188 TPS_InParseHyphen,
00189 TPS_InParseHyphenHyphen,
00190 TPS_InHyphenWordPart,
00191 TPS_InHyphenAsciiWordPart,
00192 TPS_InHyphenNumWordPart,
00193 TPS_InHyphenUnsignedInt,
00194 TPS_Null
00195 } TParserState;
00196
00197
/* Forward declaration so the callback typedefs below can refer to the parser. */
00198 struct TParser;
00199
/* Character/condition test: receives the parser and returns nonzero on match. */
00200 typedef int (*TParserCharTest) (struct TParser *);
00201
/* Optional hook attached to a transition row; receives the parser, returns nothing. */
00202 typedef void (*TParserSpecial) (struct TParser *);
00203
00204
/*
 * One row of a state's transition table (see the actionTPS_* arrays).
 */
00205 typedef struct
00206 {
00207 TParserCharTest isclass; /* test function; NULL in the final row of each table */
00208 char c; /* character argument; presumably copied into TParser.c for p_iseqC/p_isneC -- confirm in the driver */
00209 uint16 flags; /* bitwise OR of the A_* flags */
00210 TParserState tostate; /* target state named by the row */
00211 int type; /* token type code, or 0 */
00212 TParserSpecial special; /* optional hook, or NULL */
00213 } TParserStateActionItem;
00214
00215
/*
 * Bit flags stored in TParserStateActionItem.flags.  A_NEXT is zero, i.e.
 * the absence of every other flag; the rest are distinct bits that table
 * rows combine with '|'.
 *
 * NOTE(review): the flags are interpreted by the table driver, which is not
 * visible in this part of the file; the names suggest emit-token (BINGO),
 * position-stack push/pop, rerun, and clearing saved positions -- confirm
 * against the driver before relying on that.
 */
00216 #define A_NEXT 0x0000
00217 #define A_BINGO 0x0001
00218 #define A_POP 0x0002
00219 #define A_PUSH 0x0004
00220 #define A_RERUN 0x0008
00221 #define A_CLEAR 0x0010
00222 #define A_MERGE 0x0020
00223 #define A_CLRALL 0x0040
00224
/*
 * A parser position.  Positions form a singly linked chain through 'prev';
 * TParser.state points at the newest one.
 */
00225 typedef struct TParserPosition
00226 {
00227 int posbyte; /* byte offset into TParser.str */
00228 int poschar; /* character index into TParser.wstr / pgwstr */
00229 int charlen; /* presumably byte length of the current character; 0 is treated as EOF by p_isEOF */
00230 int lenbytetoken; /* bytes accumulated for the current token */
00231 int lenchartoken; /* characters accumulated for the current token */
00232 TParserState state; /* state machine state at this position */
00233 struct TParserPosition *prev; /* older position in the chain, or NULL */
00234 const TParserStateActionItem *pushedAtAction; /* reset to NULL by newTParserPosition; set elsewhere (not visible here) */
00235 } TParserPosition;
00236
/*
 * Parser instance: the input text, its optional wide-character copies, the
 * position chain, and the most recently produced token.
 */
00237 typedef struct TParser
00238 {
00239
/* input text and its length in bytes */
00240 char *str;
00241 int lenstr;
00242 #ifdef USE_WIDE_UPPER_LOWER
/*
 * Wide copies built by TParserInit when the encoding is multibyte: pgwstr
 * when the ctype locale is C, wstr otherwise.  usewide records whether
 * either copy is in use.
 */
00243 wchar_t *wstr;
00244 pg_wchar *pgwstr;
00245 bool usewide;
00246 #endif
00247
00248
00249 int charmaxlen; /* pg_database_encoding_max_length() */
00250 TParserPosition *state; /* newest position in the chain */
00251 bool ignore; /* toggled by SpecialTags for <script>/<style>; tested by p_isignore */
00252 bool wanthost; /* one-shot flag consumed by p_isstophost */
00253
00254
00255 char c; /* character compared by p_iseqC / p_isneC */
00256
00257
/* most recent token: start pointer, lengths in bytes and characters, type code */
00258 char *token;
00259 int lenbytetoken;
00260 int lenchartoken;
00261 int type;
00262 } TParser;
00263
00264
00265
/* Forward declaration: p_ishost() and p_isURLPath() call this on a sub-parser. */
00266 static bool TParserGet(TParser *prs);
00267
00268
00269 static TParserPosition *
00270 newTParserPosition(TParserPosition *prev)
00271 {
00272 TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
00273
00274 if (prev)
00275 memcpy(res, prev, sizeof(TParserPosition));
00276 else
00277 memset(res, 0, sizeof(TParserPosition));
00278
00279 res->prev = prev;
00280
00281 res->pushedAtAction = NULL;
00282
00283 return res;
00284 }
00285
00286 static TParser *
00287 TParserInit(char *str, int len)
00288 {
00289 TParser *prs = (TParser *) palloc0(sizeof(TParser));
00290
00291 prs->charmaxlen = pg_database_encoding_max_length();
00292 prs->str = str;
00293 prs->lenstr = len;
00294
00295 #ifdef USE_WIDE_UPPER_LOWER
00296
00297
00298
00299
00300 if (prs->charmaxlen > 1)
00301 {
00302 Oid collation = DEFAULT_COLLATION_OID;
00303 pg_locale_t mylocale = 0;
00304
00305 prs->usewide = true;
00306 if (lc_ctype_is_c(collation))
00307 {
00308
00309
00310
00311
00312 prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
00313 pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
00314 }
00315 else
00316 {
00317 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
00318 char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
00319 mylocale);
00320 }
00321 }
00322 else
00323 prs->usewide = false;
00324 #endif
00325
00326 prs->state = newTParserPosition(NULL);
00327 prs->state->state = TPS_Base;
00328
00329 #ifdef WPARSER_TRACE
00330
00331
00332
00333
00334
00335
00336 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
00337 #endif
00338
00339 return prs;
00340 }
00341
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353 static TParser *
00354 TParserCopyInit(const TParser *orig)
00355 {
00356 TParser *prs = (TParser *) palloc0(sizeof(TParser));
00357
00358 prs->charmaxlen = orig->charmaxlen;
00359 prs->str = orig->str + orig->state->posbyte;
00360 prs->lenstr = orig->lenstr - orig->state->posbyte;
00361
00362 #ifdef USE_WIDE_UPPER_LOWER
00363 prs->usewide = orig->usewide;
00364
00365 if (orig->pgwstr)
00366 prs->pgwstr = orig->pgwstr + orig->state->poschar;
00367 if (orig->wstr)
00368 prs->wstr = orig->wstr + orig->state->poschar;
00369 #endif
00370
00371 prs->state = newTParserPosition(NULL);
00372 prs->state->state = TPS_Base;
00373
00374 #ifdef WPARSER_TRACE
00375
00376 fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
00377 #endif
00378
00379 return prs;
00380 }
00381
00382
00383 static void
00384 TParserClose(TParser *prs)
00385 {
00386 while (prs->state)
00387 {
00388 TParserPosition *ptr = prs->state->prev;
00389
00390 pfree(prs->state);
00391 prs->state = ptr;
00392 }
00393
00394 #ifdef USE_WIDE_UPPER_LOWER
00395 if (prs->wstr)
00396 pfree(prs->wstr);
00397 if (prs->pgwstr)
00398 pfree(prs->pgwstr);
00399 #endif
00400
00401 #ifdef WPARSER_TRACE
00402 fprintf(stderr, "closing parser\n");
00403 #endif
00404 pfree(prs);
00405 }
00406
00407
00408
00409
00410 static void
00411 TParserCopyClose(TParser *prs)
00412 {
00413 while (prs->state)
00414 {
00415 TParserPosition *ptr = prs->state->prev;
00416
00417 pfree(prs->state);
00418 prs->state = ptr;
00419 }
00420
00421 #ifdef WPARSER_TRACE
00422 fprintf(stderr, "closing parser copy\n");
00423 #endif
00424 pfree(prs);
00425 }
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438 #ifdef USE_WIDE_UPPER_LOWER
00439
/*
 * p_iswhat(type) generates two predicates over the parser's current
 * character: p_is<type>() and its negation p_isnot<type>().
 *
 *	- usewide with pgwstr set: is<type>() on the low 8 bits of the pg_wchar
 *	- usewide with wstr:	   isw<type>() on the wchar_t
 *	- otherwise:			   is<type>() on the current byte of prs->str
 *
 * The wide character is read as a wchar_t and converted to wint_t by value.
 * Reading it through a (wint_t *) cast would fetch the wrong number of bytes
 * on platforms where wint_t is wider than wchar_t; the value cast also
 * matches what the hand-written p_isalnum()/p_isalpha() do.
 */
#define p_iswhat(type)														\
static int																	\
p_is##type(TParser *prs) {													\
	Assert( prs->state );													\
	if ( prs->usewide )														\
	{																		\
		if ( prs->pgwstr )													\
			return is##type( 0xff & *( prs->pgwstr + prs->state->poschar) );\
																			\
		return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) );	\
	}																		\
																			\
	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
}																			\
																			\
static int																	\
p_isnot##type(TParser *prs) {												\
	return !p_is##type(prs);												\
}
00459
00460 static int
00461 p_isalnum(TParser *prs)
00462 {
00463 Assert(prs->state);
00464
00465 if (prs->usewide)
00466 {
00467 if (prs->pgwstr)
00468 {
00469 unsigned int c = *(prs->pgwstr + prs->state->poschar);
00470
00471
00472
00473
00474
00475 if (c > 0x7f)
00476 return 1;
00477
00478 return isalnum(0xff & c);
00479 }
00480
00481 return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
00482 }
00483
00484 return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
00485 }
00486 static int
00487 p_isnotalnum(TParser *prs)
00488 {
00489 return !p_isalnum(prs);
00490 }
00491
00492 static int
00493 p_isalpha(TParser *prs)
00494 {
00495 Assert(prs->state);
00496
00497 if (prs->usewide)
00498 {
00499 if (prs->pgwstr)
00500 {
00501 unsigned int c = *(prs->pgwstr + prs->state->poschar);
00502
00503
00504
00505
00506
00507 if (c > 0x7f)
00508 return 1;
00509
00510 return isalpha(0xff & c);
00511 }
00512
00513 return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
00514 }
00515
00516 return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
00517 }
00518
00519 static int
00520 p_isnotalpha(TParser *prs)
00521 {
00522 return !p_isalpha(prs);
00523 }
00524
00525
00526
00527 static int
00528 p_iseq(TParser *prs, char c)
00529 {
00530 Assert(prs->state);
00531 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
00532 }
00533 #else
00534
/*
 * p_iswhat(type) generates two predicates over the parser's current byte:
 * p_is<type>(), which applies is<type>() to it, and the negation
 * p_isnot<type>().  This is the variant used without wide-character support.
 */
#define p_iswhat(type)														\
static int																	\
p_is##type(TParser *prs)													\
{																			\
	unsigned char ch;														\
																			\
	Assert(prs->state);														\
	ch = (unsigned char) prs->str[prs->state->posbyte];					\
	return is##type(ch);													\
}																			\
																			\
static int																	\
p_isnot##type(TParser *prs)													\
{																			\
	return !p_is##type(prs);												\
}
00546
00547
00548 static int
00549 p_iseq(TParser *prs, char c)
00550 {
00551 Assert(prs->state);
00552 return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
00553 }
00554
/*
 * Without wide-character support, p_isalnum/p_isnotalnum and
 * p_isalpha/p_isnotalpha are generated from the macro rather than
 * hand-written.
 */
00555 p_iswhat(alnum)
00556 p_iswhat(alpha)
00557 #endif
00558
/*
 * Instantiate p_is<type>() / p_isnot<type>() for the remaining character
 * classes, using whichever p_iswhat variant was selected above.
 */
00559 p_iswhat(digit)
00560 p_iswhat(lower)
00561 p_iswhat(print)
00562 p_iswhat(punct)
00563 p_iswhat(space)
00564 p_iswhat(upper)
00565 p_iswhat(xdigit)
00566
00567 static int
00568 p_isEOF(TParser *prs)
00569 {
00570 Assert(prs->state);
00571 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
00572 }
00573
00574 static int
00575 p_iseqC(TParser *prs)
00576 {
00577 return p_iseq(prs, prs->c);
00578 }
00579
00580 static int
00581 p_isneC(TParser *prs)
00582 {
00583 return !p_iseq(prs, prs->c);
00584 }
00585
00586 static int
00587 p_isascii(TParser *prs)
00588 {
00589 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
00590 }
00591
00592 static int
00593 p_isasclet(TParser *prs)
00594 {
00595 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
00596 }
00597
00598 static int
00599 p_isurlchar(TParser *prs)
00600 {
00601 char ch;
00602
00603
00604 if (prs->state->charlen != 1)
00605 return 0;
00606 ch = *(prs->str + prs->state->posbyte);
00607
00608 if (ch <= 0x20 || ch >= 0x7F)
00609 return 0;
00610
00611 switch (ch)
00612 {
00613 case '"':
00614 case '<':
00615 case '>':
00616 case '\\':
00617 case '^':
00618 case '`':
00619 case '{':
00620 case '|':
00621 case '}':
00622 return 0;
00623 }
00624 return 1;
00625 }
00626
00627
00628
/*
 * References every generated predicate once.  Nothing in the visible part
 * of the file calls this function; presumably it exists only so that
 * compilers do not warn about unused static functions.  It must not
 * actually be run, since every call passes a NULL parser.
 */
void		_make_compiler_happy(void);

void
_make_compiler_happy(void)
{
	(void) p_isalnum(NULL);
	(void) p_isnotalnum(NULL);
	(void) p_isalpha(NULL);
	(void) p_isnotalpha(NULL);
	(void) p_isdigit(NULL);
	(void) p_isnotdigit(NULL);
	(void) p_islower(NULL);
	(void) p_isnotlower(NULL);
	(void) p_isprint(NULL);
	(void) p_isnotprint(NULL);
	(void) p_ispunct(NULL);
	(void) p_isnotpunct(NULL);
	(void) p_isspace(NULL);
	(void) p_isnotspace(NULL);
	(void) p_isupper(NULL);
	(void) p_isnotupper(NULL);
	(void) p_isxdigit(NULL);
	(void) p_isnotxdigit(NULL);
	(void) p_isEOF(NULL);
	(void) p_iseqC(NULL);
	(void) p_isneC(NULL);
}
00655
00656
00657 static void
00658 SpecialTags(TParser *prs)
00659 {
00660 switch (prs->state->lenchartoken)
00661 {
00662 case 8:
00663 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
00664 prs->ignore = false;
00665 break;
00666 case 7:
00667 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
00668 prs->ignore = false;
00669 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
00670 prs->ignore = true;
00671 break;
00672 case 6:
00673 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
00674 prs->ignore = true;
00675 break;
00676 default:
00677 break;
00678 }
00679 }
00680
00681 static void
00682 SpecialFURL(TParser *prs)
00683 {
00684 prs->wanthost = true;
00685 prs->state->posbyte -= prs->state->lenbytetoken;
00686 prs->state->poschar -= prs->state->lenchartoken;
00687 }
00688
00689 static void
00690 SpecialHyphen(TParser *prs)
00691 {
00692 prs->state->posbyte -= prs->state->lenbytetoken;
00693 prs->state->poschar -= prs->state->lenchartoken;
00694 }
00695
00696 static void
00697 SpecialVerVersion(TParser *prs)
00698 {
00699 prs->state->posbyte -= prs->state->lenbytetoken;
00700 prs->state->poschar -= prs->state->lenchartoken;
00701 prs->state->lenbytetoken = 0;
00702 prs->state->lenchartoken = 0;
00703 }
00704
00705 static int
00706 p_isstophost(TParser *prs)
00707 {
00708 if (prs->wanthost)
00709 {
00710 prs->wanthost = false;
00711 return 1;
00712 }
00713 return 0;
00714 }
00715
00716 static int
00717 p_isignore(TParser *prs)
00718 {
00719 return (prs->ignore) ? 1 : 0;
00720 }
00721
00722 static int
00723 p_ishost(TParser *prs)
00724 {
00725 TParser *tmpprs = TParserCopyInit(prs);
00726 int res = 0;
00727
00728 tmpprs->wanthost = true;
00729
00730 if (TParserGet(tmpprs) && tmpprs->type == HOST)
00731 {
00732 prs->state->posbyte += tmpprs->lenbytetoken;
00733 prs->state->poschar += tmpprs->lenchartoken;
00734 prs->state->lenbytetoken += tmpprs->lenbytetoken;
00735 prs->state->lenchartoken += tmpprs->lenchartoken;
00736 prs->state->charlen = tmpprs->state->charlen;
00737 res = 1;
00738 }
00739 TParserCopyClose(tmpprs);
00740
00741 return res;
00742 }
00743
00744 static int
00745 p_isURLPath(TParser *prs)
00746 {
00747 TParser *tmpprs = TParserCopyInit(prs);
00748 int res = 0;
00749
00750 tmpprs->state = newTParserPosition(tmpprs->state);
00751 tmpprs->state->state = TPS_InURLPathFirst;
00752
00753 if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
00754 {
00755 prs->state->posbyte += tmpprs->lenbytetoken;
00756 prs->state->poschar += tmpprs->lenchartoken;
00757 prs->state->lenbytetoken += tmpprs->lenbytetoken;
00758 prs->state->lenchartoken += tmpprs->lenchartoken;
00759 prs->state->charlen = tmpprs->state->charlen;
00760 res = 1;
00761 }
00762 TParserCopyClose(tmpprs);
00763
00764 return res;
00765 }
00766
00767
00768
00769
00770
00771
00772
00773 static int
00774 p_isspecial(TParser *prs)
00775 {
00776
00777
00778
00779 if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
00780 return 1;
00781
00782 #ifdef USE_WIDE_UPPER_LOWER
00783
00784
00785
00786
00787
00788
00789
00790 if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
00791 {
00792 static pg_wchar strange_letter[] = {
00793
00794
00795
00796 0x0903,
00797 0x093E,
00798 0x093F,
00799 0x0940,
00800 0x0949,
00801 0x094A,
00802 0x094B,
00803 0x094C,
00804 0x0982,
00805 0x0983,
00806 0x09BE,
00807 0x09BF,
00808 0x09C0,
00809 0x09C7,
00810 0x09C8,
00811 0x09CB,
00812 0x09CC,
00813 0x09D7,
00814 0x0A03,
00815 0x0A3E,
00816 0x0A3F,
00817 0x0A40,
00818 0x0A83,
00819 0x0ABE,
00820 0x0ABF,
00821 0x0AC0,
00822 0x0AC9,
00823 0x0ACB,
00824 0x0ACC,
00825 0x0B02,
00826 0x0B03,
00827 0x0B3E,
00828 0x0B40,
00829 0x0B47,
00830 0x0B48,
00831 0x0B4B,
00832 0x0B4C,
00833 0x0B57,
00834 0x0BBE,
00835 0x0BBF,
00836 0x0BC1,
00837 0x0BC2,
00838 0x0BC6,
00839 0x0BC7,
00840 0x0BC8,
00841 0x0BCA,
00842 0x0BCB,
00843 0x0BCC,
00844 0x0BD7,
00845 0x0C01,
00846 0x0C02,
00847 0x0C03,
00848 0x0C41,
00849 0x0C42,
00850 0x0C43,
00851 0x0C44,
00852 0x0C82,
00853 0x0C83,
00854 0x0CBE,
00855 0x0CC0,
00856 0x0CC1,
00857 0x0CC2,
00858 0x0CC3,
00859 0x0CC4,
00860 0x0CC7,
00861 0x0CC8,
00862 0x0CCA,
00863 0x0CCB,
00864 0x0CD5,
00865 0x0CD6,
00866 0x0D02,
00867 0x0D03,
00868 0x0D3E,
00869 0x0D3F,
00870 0x0D40,
00871 0x0D46,
00872 0x0D47,
00873 0x0D48,
00874 0x0D4A,
00875 0x0D4B,
00876 0x0D4C,
00877 0x0D57,
00878 0x0D82,
00879 0x0D83,
00880 0x0DCF,
00881 0x0DD0,
00882 0x0DD1,
00883 0x0DD8,
00884 0x0DD9,
00885 0x0DDA,
00886 0x0DDB,
00887 0x0DDC,
00888 0x0DDD,
00889
00890 0x0DDE,
00891 0x0DDF,
00892 0x0DF2,
00893 0x0DF3,
00894 0x0F3E,
00895 0x0F3F,
00896 0x0F7F,
00897 0x102B,
00898 0x102C,
00899 0x1031,
00900 0x1038,
00901 0x103B,
00902 0x103C,
00903 0x1056,
00904 0x1057,
00905 0x1062,
00906 0x1063,
00907 0x1064,
00908 0x1067,
00909 0x1068,
00910 0x1069,
00911 0x106A,
00912 0x106B,
00913 0x106C,
00914 0x106D,
00915 0x1083,
00916 0x1084,
00917 0x1087,
00918 0x1088,
00919 0x1089,
00920 0x108A,
00921 0x108B,
00922 0x108C,
00923 0x108F,
00924 0x17B6,
00925 0x17BE,
00926 0x17BF,
00927 0x17C0,
00928 0x17C1,
00929 0x17C2,
00930 0x17C3,
00931 0x17C4,
00932 0x17C5,
00933 0x17C7,
00934 0x17C8,
00935 0x1923,
00936 0x1924,
00937 0x1925,
00938 0x1926,
00939 0x1929,
00940 0x192A,
00941 0x192B,
00942 0x1930,
00943 0x1931,
00944 0x1933,
00945 0x1934,
00946 0x1935,
00947 0x1936,
00948 0x1937,
00949 0x1938,
00950 0x19B0,
00951 0x19B1,
00952 0x19B2,
00953 0x19B3,
00954 0x19B4,
00955 0x19B5,
00956 0x19B6,
00957 0x19B7,
00958 0x19B8,
00959 0x19B9,
00960 0x19BA,
00961 0x19BB,
00962 0x19BC,
00963 0x19BD,
00964 0x19BE,
00965 0x19BF,
00966 0x19C0,
00967 0x19C8,
00968 0x19C9,
00969 0x1A19,
00970 0x1A1A,
00971 0x1A1B,
00972 0x1B04,
00973 0x1B35,
00974 0x1B3B,
00975 0x1B3D,
00976 0x1B3E,
00977 0x1B3F,
00978 0x1B40,
00979 0x1B41,
00980 0x1B43,
00981 0x1B44,
00982 0x1B82,
00983 0x1BA1,
00984 0x1BA6,
00985 0x1BA7,
00986 0x1BAA,
00987 0x1C24,
00988 0x1C25,
00989 0x1C26,
00990 0x1C27,
00991 0x1C28,
00992 0x1C29,
00993 0x1C2A,
00994 0x1C2B,
00995 0x1C34,
00996 0x1C35,
00997 0xA823,
00998 0xA824,
00999 0xA827,
01000 0xA880,
01001 0xA881,
01002 0xA8B4,
01003 0xA8B5,
01004 0xA8B6,
01005 0xA8B7,
01006 0xA8B8,
01007 0xA8B9,
01008 0xA8BA,
01009 0xA8BB,
01010 0xA8BC,
01011 0xA8BD,
01012 0xA8BE,
01013 0xA8BF,
01014 0xA8C0,
01015 0xA8C1,
01016 0xA8C2,
01017 0xA8C3,
01018 0xA952,
01019 0xA953,
01020 0xAA2F,
01021 0xAA30,
01022 0xAA33,
01023 0xAA34,
01024 0xAA4D
01025 };
01026 pg_wchar *StopLow = strange_letter,
01027 *StopHigh = strange_letter + lengthof(strange_letter),
01028 *StopMiddle;
01029 pg_wchar c;
01030
01031 if (prs->pgwstr)
01032 c = *(prs->pgwstr + prs->state->poschar);
01033 else
01034 c = (pg_wchar) *(prs->wstr + prs->state->poschar);
01035
01036 while (StopLow < StopHigh)
01037 {
01038 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
01039 if (*StopMiddle == c)
01040 return 1;
01041 else if (*StopMiddle < c)
01042 StopLow = StopMiddle + 1;
01043 else
01044 StopHigh = StopMiddle;
01045 }
01046 }
01047 #endif
01048
01049 return 0;
01050 }
01051
01052
01053
01054
01055
/*
 * Transition table for TPS_Base.  Each row is {test, char argument, A_*
 * flags, target state, token type, special hook}; the last row has a NULL
 * test.  Rows here dispatch on the first character: '<', letters, digits,
 * sign, '&', '~', '/', '.', with TPS_InSpace as the NULL-test row's target.
 * NOTE(review): rows are consumed by the table driver (not visible here);
 * presumably the first matching row wins, so keep the order.
 */
01056 static const TParserStateActionItem actionTPS_Base[] = {
01057 {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
01058 {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
01059 {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
01060 {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
01061 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
01062 {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
01063 {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
01064 {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
01065 {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
01066 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
01067 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01068 {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
01069 {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
01070 };
01071
01072
/*
 * Transition table for TPS_InNumWord.  The A_BINGO rows (EOF and the
 * NULL-test row) carry token type NUMWORD; alnum/special characters keep
 * the state; '@', '/', '.', '-' are A_PUSH rows into other states.
 */
01073 static const TParserStateActionItem actionTPS_InNumWord[] = {
01074 {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
01075 {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01076 {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01077 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01078 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01079 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
01080 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
01081 {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
01082 };
01083
/*
 * Transition table for TPS_InAsciiWord.  The A_BINGO rows carry token type
 * ASCIIWORD.  Several characters ('.', '-', digits) have more than one row
 * with different targets; the A_PUSH rows come first.
 * NOTE(review): how the driver falls through from one such row to the next
 * is not visible here -- keep the row order.
 */
01084 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
01085 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
01086 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
01087 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01088 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
01089 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01090 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
01091 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01092 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01093 {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
01094 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01095 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
01096 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01097 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
01098 {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
01099 {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
01100 };
01101
/*
 * Transition table for TPS_InWord.  The A_BINGO rows carry token type
 * WORD_T; a digit moves to TPS_InNumWord and '-' is an A_PUSH row into
 * TPS_InHyphenWordFirst.
 */
01102 static const TParserStateActionItem actionTPS_InWord[] = {
01103 {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
01104 {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
01105 {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
01106 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01107 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
01108 {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
01109 };
01110
/*
 * Transition table for TPS_InUnsignedInt.  The A_BINGO rows carry token
 * type UNSIGNEDINT.  '.' has two A_PUSH rows (host domain, then decimal
 * fraction); 'e'/'E' push into TPS_InMantissaFirst; an ASCII letter pushes
 * into TPS_InHost before the plain alpha row targets TPS_InNumWord.
 */
01111 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
01112 {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
01113 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
01114 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01115 {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
01116 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01117 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01118 {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
01119 {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01120 {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
01121 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01122 {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
01123 };
01124
/*
 * Transition table for TPS_InSignedIntFirst (entered from TPS_Base on '+'
 * or '-').  Only a digit continues, to TPS_InSignedInt with A_CLEAR; EOF
 * and anything else are A_POP rows.
 */
01125 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
01126 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01127 {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
01128 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01129 };
01130
/*
 * Transition table for TPS_InSignedInt.  The A_BINGO rows carry token type
 * SIGNEDINT; '.' pushes into TPS_InDecimalFirst and 'e'/'E' into
 * TPS_InMantissaFirst.
 */
01131 static const TParserStateActionItem actionTPS_InSignedInt[] = {
01132 {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
01133 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
01134 {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
01135 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01136 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01137 {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
01138 };
01139
/*
 * Transition table for TPS_InSpace.  The A_BINGO rows carry token type
 * SPACE and are taken at EOF, on '<', '-', '+', '&', '/', and by the
 * NULL-test row.  While p_isignore holds, or the character is not
 * alphanumeric, the state is kept.  The p_isignore row sits after the '<'
 * row but before the other single-character rows.
 */
01140 static const TParserStateActionItem actionTPS_InSpace[] = {
01141 {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
01142 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
01143 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
01144 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
01145 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
01146 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
01147 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
01148 {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
01149 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
01150 };
01151
/*
 * Transition table for TPS_InUDecimalFirst (entered from TPS_InUnsignedInt
 * on '.').  A digit goes to TPS_InUDecimal with A_CLEAR; EOF and anything
 * else are A_POP rows.
 */
01152 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
01153 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01154 {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
01155 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01156 };
01157
/*
 * Transition table for TPS_InUDecimal.  The A_BINGO rows carry token type
 * DECIMAL_T; a further '.' pushes into TPS_InVersionFirst and 'e'/'E' into
 * TPS_InMantissaFirst.
 */
01158 static const TParserStateActionItem actionTPS_InUDecimal[] = {
01159 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
01160 {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
01161 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
01162 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01163 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01164 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
01165 };
01166
/*
 * Transition table for TPS_InDecimalFirst (entered from TPS_InSignedInt on
 * '.').  A digit goes to TPS_InDecimal with A_CLEAR; EOF and anything else
 * are A_POP rows.
 */
01167 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
01168 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01169 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
01170 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01171 };
01172
/*
 * Transition table for TPS_InDecimal (signed decimal).  The A_BINGO rows
 * carry token type DECIMAL_T; a further '.' pushes into TPS_InVerVersion
 * and 'e'/'E' into TPS_InMantissaFirst.
 */
01173 static const TParserStateActionItem actionTPS_InDecimal[] = {
01174 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
01175 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
01176 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
01177 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01178 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
01179 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
01180 };
01181
/*
 * Transition table for TPS_InVerVersion (entered from TPS_InDecimal on a
 * second '.').  On a digit the row is A_RERUN into TPS_InSVerVersion and
 * runs the SpecialVerVersion hook, which rewinds the position by the token
 * length and zeroes the token lengths.  EOF and anything else are A_POP.
 */
01182 static const TParserStateActionItem actionTPS_InVerVersion[] = {
01183 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01184 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
01185 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01186 };
01187
/*
 * Transition table for TPS_InSVerVersion.  On a digit the row is
 * A_BINGO | A_CLRALL with token type SPACE and target TPS_InUnsignedInt;
 * the NULL-test row is A_NEXT, EOF is A_POP.
 * NOTE(review): presumably this emits the leading sign as SPACE and then
 * re-reads the digits as an unsigned number -- confirm against the driver.
 */
01188 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
01189 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01190 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
01191 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
01192 };
01193
01194
/*
 * Transition table for TPS_InVersionFirst (entered on '.' from
 * TPS_InUDecimal or TPS_InVersion).  A digit goes to TPS_InVersion with
 * A_CLEAR; EOF and anything else are A_POP rows.
 */
01195 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
01196 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01197 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
01198 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01199 };
01200
/*
 * Transition table for TPS_InVersion.  The A_BINGO rows carry token type
 * VERSIONNUMBER; digits keep the state and '.' pushes into
 * TPS_InVersionFirst for the next component.
 */
01201 static const TParserStateActionItem actionTPS_InVersion[] = {
01202 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
01203 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
01204 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
01205 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
01206 };
01207
/*
 * Transition table for TPS_InMantissaFirst (entered on 'e'/'E' from the
 * numeric states).  A digit goes to TPS_InMantissa with A_CLEAR; '+' or '-'
 * goes to TPS_InMantissaSign; EOF and anything else are A_POP rows.
 */
01208 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
01209 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01210 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
01211 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
01212 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
01213 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01214 };
01215
/*
 * Transition table for TPS_InMantissaSign (after the exponent's sign).  A
 * digit goes to TPS_InMantissa with A_CLEAR; EOF and anything else are
 * A_POP rows.
 */
01216 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
01217 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01218 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
01219 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01220 };
01221
/*
 * Transition table for TPS_InMantissa.  Digits keep the state; the A_BINGO
 * rows (EOF and the NULL-test row) carry token type SCIENTIFIC.
 */
01222 static const TParserStateActionItem actionTPS_InMantissa[] = {
01223 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
01224 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
01225 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
01226 };
01227
/*
 * Transition table for TPS_InXMLEntityFirst (entered from TPS_Base on '&').
 * '#' starts a numeric entity (TPS_InXMLEntityNumFirst); an ASCII letter,
 * ':' or '_' starts a named entity (TPS_InXMLEntity); EOF and anything
 * else are A_POP rows.
 */
01228 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
01229 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01230 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
01231 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
01232 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
01233 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
01234 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01235 };
01236
/*
 * Transition table for TPS_InXMLEntity (body of a named entity).
 * Alphanumerics and ':', '_', '.', '-' keep the state; ';' goes to
 * TPS_InXMLEntityEnd; EOF and anything else are A_POP rows.
 */
01237 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
01238 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01239 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
01240 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
01241 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
01242 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
01243 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
01244 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
01245 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01246 };
01247
/*
 * Transition table for TPS_InXMLEntityNumFirst (after "&#").  'x' or 'X'
 * selects the hexadecimal form (TPS_InXMLEntityHexNumFirst); a digit
 * selects the decimal form (TPS_InXMLEntityNum); EOF and anything else are
 * A_POP rows.
 */
01248 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
01249 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01250 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
01251 {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
01252 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
01253 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01254 };
01255
/*
 * Transition table for TPS_InXMLEntityHexNumFirst (after "&#x").  At least
 * one hex digit is required to reach TPS_InXMLEntityHexNum; EOF and
 * anything else are A_POP rows.
 */
01256 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
01257 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01258 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
01259 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01260 };
01261
/*
 * Transition table for TPS_InXMLEntityNum (decimal entity digits).  Digits
 * keep the state; ';' goes to TPS_InXMLEntityEnd; EOF and anything else
 * are A_POP rows.
 */
01262 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
01263 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01264 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
01265 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
01266 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01267 };
01268
/*
 * Transition table for TPS_InXMLEntityHexNum (hexadecimal entity digits).
 * Hex digits keep the state; ';' goes to TPS_InXMLEntityEnd; EOF and
 * anything else are A_POP rows.
 */
01269 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
01270 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01271 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
01272 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
01273 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01274 };
01275
/*
 * Transition table for TPS_InXMLEntityEnd (reached after the closing ';').
 * Its single NULL-test row is A_BINGO | A_CLEAR with token type XMLENTITY
 * and target TPS_Base.
 */
01276 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
01277 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
01278 };
01279
/*
 * Transition table for TPS_InTagFirst (entered from TPS_Base on '<').
 * '/' pushes into TPS_InTagCloseFirst, '!' into TPS_InCommentFirst, '?'
 * into TPS_InXMLBegin; an ASCII letter, ':' or '_' pushes into
 * TPS_InTagName; EOF and anything else are A_POP rows.
 */
01280 static const TParserStateActionItem actionTPS_InTagFirst[] = {
01281 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01282 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
01283 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
01284 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
01285 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
01286 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
01287 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
01288 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01289 };
01290
/*
 * Transition table for TPS_InXMLBegin (after "<?").  Only the single
 * character 'x' is checked before moving to TPS_InTag; the following
 * characters are not verified by this table.  EOF and anything else are
 * A_POP rows.
 */
01291 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
01292 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01293
01294
01295 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
01296 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01297 };
01298
/*
 * Transition table for TPS_InTagCloseFirst (after "</").  An ASCII letter
 * continues to TPS_InTagName; EOF and anything else are A_POP rows.
 */
01299 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
01300 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01301 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
01302 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01303 };
01304
/*
 * Transition table for TPS_InTagName.  '/' goes to TPS_InTagBeginEnd; '>'
 * goes to TPS_InTagEnd and whitespace to TPS_InTag, both running the
 * SpecialTags hook (which maintains prs->ignore for script/style tags).
 * Alphanumerics and ':', '_', '.', '-' are A_NEXT rows with target
 * TPS_Null; EOF and anything else are A_POP rows.
 */
01305 static const TParserStateActionItem actionTPS_InTagName[] = {
01306 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01307
01308 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
01309 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
01310 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
01311 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
01312 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
01313 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
01314 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
01315 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
01316 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01317 };
01318
/*
 * Rules for TPS_InTagBeginEnd (reached from TPS_InTagName on '/'): only '>'
 * is accepted, leading to TPS_InTagEnd.  EOF or anything else pops.
 */
01319 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
01320 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01321 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
01322 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01323 };
01324
/*
 * Rules for TPS_InTag: the part of a tag after its name.  '>' ends the tag
 * (TPS_InTagEnd); a single or double quote enters the matching quoted-value
 * state; the listed characters and character classes are consumed while
 * staying in this state (TPS_Null target).  The SpecialTags callback is
 * invoked on the '>' and whitespace rules.  EOF or any unlisted character
 * pops, i.e. the text is not accepted as a tag.
 */
01325 static const TParserStateActionItem actionTPS_InTag[] = {
01326 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01327 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
01328 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
01329 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
01330 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
01331 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
01332 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
01333 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
01334 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
01335 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
01336 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
01337 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
01338 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
01339 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
01340 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
01341 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
01342 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
01343 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
01344 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01345 };
01346
/*
 * Rules for TPS_InTagEscapeK: inside a single-quoted value within a tag.
 * A backslash pushes TPS_InTagBackSleshed; the closing single quote returns
 * to TPS_InTag; every other character is consumed in place.  EOF pops.
 */
01347 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
01348 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01349 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
01350 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
01351 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
01352 };
01353
/*
 * Rules for TPS_InTagEscapeKK: inside a double-quoted value within a tag.
 * A backslash pushes TPS_InTagBackSleshed; the closing double quote returns
 * to TPS_InTag; every other character is consumed in place.  EOF pops.
 */
01354 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
01355 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01356 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
01357 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
01358 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
01359 };
01360
/*
 * Rules for TPS_InTagBackSleshed (pushed on a backslash inside a quoted tag
 * value): any character is accepted with A_MERGE, which returns to the
 * pushing state while keeping the advanced position, so the character after
 * the backslash is consumed without being interpreted.  EOF pops.
 */
01361 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
01362 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01363 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
01364 };
01365
/*
 * Rules for TPS_InTagEnd: a single unconditional rule that emits a TAG_T
 * token, discards all saved positions (A_CLRALL) and returns to TPS_Base.
 */
01366 static const TParserStateActionItem actionTPS_InTagEnd[] = {
01367 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
01368 };
01369
/*
 * Rules for TPS_InCommentFirst (reached from TPS_InTagFirst on '!'):
 * '-' starts the comment opener (TPS_InCommentLast expects a second '-');
 * 'D' or 'd' switches to generic tag parsing (TPS_InTag).
 * EOF or anything else pops.
 */
01370 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
01371 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01372 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
01373 /* NOTE(review): 'D'/'d' presumably covers "<!DOCTYPE ...>"; only the first letter is checked */
01374 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
01375 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
01376 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01377 };
01378
/*
 * Rules for TPS_InCommentLast: requires the second '-' of the comment
 * opener, then enters the comment body (TPS_InComment).  EOF or anything
 * else pops.
 */
01379 static const TParserStateActionItem actionTPS_InCommentLast[] = {
01380 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01381 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
01382 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01383 };
01384
/*
 * Rules for TPS_InComment: comment body.  '-' may begin the closing
 * sequence (TPS_InCloseCommentFirst); any other character is consumed in
 * place.  EOF pops (an unterminated comment is not accepted).
 */
01385 static const TParserStateActionItem actionTPS_InComment[] = {
01386 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01387 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
01388 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
01389 };
01390
/*
 * Rules for TPS_InCloseCommentFirst: one '-' has been seen inside a
 * comment.  A second '-' moves to TPS_InCloseCommentLast; any other
 * character goes back to the comment body.  EOF pops.
 */
01391 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
01392 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01393 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
01394 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
01395 };
01396
/*
 * Rules for TPS_InCloseCommentLast: "--" has been seen inside a comment.
 * Further '-' characters keep the state; '>' completes the comment
 * (TPS_InCommentEnd); any other character goes back to the comment body.
 * EOF pops.
 */
01397 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
01398 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01399 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
01400 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
01401 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
01402 };
01403
/*
 * Rules for TPS_InCommentEnd: a single unconditional rule that emits the
 * whole comment as a TAG_T token, discards all saved positions (A_CLRALL)
 * and returns to TPS_Base.
 */
01404 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
01405 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
01406 };
01407
/*
 * Rules for TPS_InHostFirstDomain (pushed by the host states on '.'): first
 * character of the next host-name label.  A p_isasclet character goes to
 * TPS_InHostDomainSecond, a p_isdigit character to TPS_InHost.  EOF or
 * anything else pops.
 */
01408 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
01409 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01410 {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
01411 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
01412 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01413 };
01414
/*
 * Rules for TPS_InHostDomainSecond: second character of a label that began
 * with a p_isasclet character.  Another p_isasclet character moves to
 * TPS_InHostDomain (where a HOST token can be emitted).  Digits, '-', '_',
 * '.' and '@' push the corresponding sub-states.  EOF or anything else
 * pops, so no token is emitted from this state itself.
 */
01415 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
01416 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01417 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
01418 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
01419 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01420 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01421 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01422 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01423 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01424 };
01425
/*
 * Rules for TPS_InHostDomain: inside a label that already has at least two
 * p_isasclet characters.  At EOF, or on any character no rule accepts, a
 * HOST token is emitted.  Digits, ':', '-', '_', '.', '@' and '/' push
 * sub-states that try to extend the match (port, further labels, e-mail,
 * full URL); if such a sub-parse pops, matching resumes with the rule
 * following the one that pushed.
 */
01426 static const TParserStateActionItem actionTPS_InHostDomain[] = {
01427 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
01428 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
01429 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
01430 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
01431 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01432 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01433 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01434 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
/*
 * Reachable only when resuming after a pop: on a fresh pass the earlier
 * p_isdigit rule (A_PUSH) always matches first.
 */
01435 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
01436 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
01437 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
01438 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
01439 };
01440
/*
 * Rules for TPS_InPortFirst (pushed from TPS_InHostDomain on ':'): at least
 * one p_isdigit character is required, after which parsing continues in
 * TPS_InPort.  EOF or anything else pops.
 */
01441 static const TParserStateActionItem actionTPS_InPortFirst[] = {
01442 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01443 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
01444 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01445 };
01446
/*
 * Rules for TPS_InPort: further p_isdigit characters are consumed.  At EOF
 * or on any other character a HOST token is emitted; when p_isstophost
 * accepts, the next state is TPS_InURLPathStart instead of TPS_Base.  '/'
 * first pushes TPS_InFURL to try for a full URL.
 */
01447 static const TParserStateActionItem actionTPS_InPort[] = {
01448 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
01449 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
01450 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
01451 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
01452 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
01453 };
01454
/*
 * Rules for TPS_InHostFirstAN (pushed by the host states on '-' or '_'):
 * the next character must satisfy p_isdigit or p_isasclet, then parsing
 * continues in TPS_InHost.  EOF or anything else pops.
 */
01455 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
01456 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01457 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
01458 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
01459 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01460 };
01461
/*
 * Rules for TPS_InHost: inside a host-name label.  p_isdigit and p_isasclet
 * characters are consumed; '@', '.', '-' and '_' push the e-mail,
 * next-label and after-separator states.  EOF or anything else pops: this
 * state never emits a token itself.
 */
01462 static const TParserStateActionItem actionTPS_InHost[] = {
01463 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01464 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
01465 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
01466 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
01467 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
01468 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01469 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
01470 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01471 };
01472
/*
 * Rules for TPS_InEmail (pushed by the host states on '@'): if p_isstophost
 * accepts, pop; otherwise if p_ishost accepts, emit an EMAIL token, discard
 * all saved positions and return to TPS_Base; otherwise pop.
 * Note there is no explicit p_isEOF rule in this state.
 */
01473 static const TParserStateActionItem actionTPS_InEmail[] = {
01474 {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
01475 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
01476 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01477 };
01478
/*
 * Rules for TPS_InFileFirst: first character of a file-path component.
 * p_isasclet, p_isdigit and '_' go to TPS_InFile; '.' goes to
 * TPS_InPathFirst; '~' pushes TPS_InFileTwiddle.  EOF or anything else
 * pops.
 */
01479 static const TParserStateActionItem actionTPS_InFileFirst[] = {
01480 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01481 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
01482 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
01483 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
01484 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
01485 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
01486 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01487 };
01488
/*
 * Rules for TPS_InFileTwiddle (pushed from TPS_InFileFirst on '~'):
 * p_isasclet, p_isdigit and '_' continue in TPS_InFile; '/' starts a new
 * path component (TPS_InFileFirst).  EOF or anything else pops.
 */
01489 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
01490 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01491 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
01492 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
01493 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
01494 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
01495 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01496 };
01497
/*
 * Rules for TPS_InPathFirst (reached from TPS_InFileFirst on '.'):
 * p_isasclet, p_isdigit and '_' continue in TPS_InFile; a second '.' goes
 * to TPS_InPathSecond; '/' starts a new path component (TPS_InFileFirst).
 * EOF or anything else pops.
 */
01498 static const TParserStateActionItem actionTPS_InPathFirst[] = {
01499 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01500 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
01501 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
01502 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
01503 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
01504 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
01505 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01506 };
01507
/*
 * Rules for TPS_InPathFirstFirst: only '.' (to TPS_InPathSecond) or '/'
 * (to TPS_InFileFirst) are accepted.  EOF or anything else pops.
 * NOTE(review): presumably entered after a leading '.'; the rule that
 * enters this state is not in this part of the file.
 */
01508 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
01509 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01510 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
01511 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
01512 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01513 };
01514
/*
 * Rules for TPS_InPathSecond (reached after a second '.'): at EOF or on a
 * p_isspace character a FILEPATH token is emitted.  On '/', the parser
 * first pushes TPS_InFileFirst to try for a longer path; if that attempt
 * pops, the second '/' rule emits FILEPATH for what was matched so far.
 * Anything else pops.
 */
01515 static const TParserStateActionItem actionTPS_InPathSecond[] = {
01516 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
01517 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
/* Reachable only when resuming after the push above has been popped. */
01518 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
01519 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
01520 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01521 };
01522
/*
 * Rules for TPS_InFile: inside a file-path component.  p_isasclet,
 * p_isdigit, '_' and '-' are consumed; '.' pushes TPS_InFileNext and '/'
 * pushes TPS_InFileFirst to try to extend the path.  At EOF or on any other
 * character a FILEPATH token is emitted (A_BINGO only; no saved positions
 * are discarded by these rules).
 */
01523 static const TParserStateActionItem actionTPS_InFile[] = {
01524 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
01525 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
01526 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
01527 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
01528 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
01529 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
01530 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
01531 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
01532 };
01533
/*
 * Rules for TPS_InFileNext (pushed from TPS_InFile on '.'): if the next
 * character satisfies p_isasclet or p_isdigit, or is '_', the saved
 * position is dropped (A_CLEAR) and parsing continues in TPS_InFile, so the
 * '.' becomes part of the path.  EOF or anything else pops back to before
 * the '.'.
 */
01534 static const TParserStateActionItem actionTPS_InFileNext[] = {
01535 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01536 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
01537 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
01538 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
01539 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01540 };
01541
/*
 * Rules for TPS_InURLPathFirst: requires one character accepted by
 * p_isurlchar, then continues in TPS_InURLPath.  EOF or anything else pops.
 */
01542 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
01543 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01544 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
01545 {NULL, 0, A_POP, TPS_Null, 0, NULL},
01546 };
01547
/*
 * Rules for TPS_InURLPathStart (entered after a HOST token was emitted on a
 * p_isstophost character): unconditionally consume the current character
 * and continue in TPS_InURLPath.
 */
01548 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
01549 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
01550 };
01551
/*
 * Rules for TPS_InURLPath: consume characters accepted by p_isurlchar; at
 * EOF or on any other character emit a URLPATH token and return to
 * TPS_Base.
 */
01552 static const TParserStateActionItem actionTPS_InURLPath[] = {
01553 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
01554 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
01555 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
01556 };
01557
/*
 * Rules for TPS_InFURL (pushed from the host/port states on '/'): if
 * p_isURLPath accepts, the SpecialFURL callback is run and a URL_T token is
 * emitted, all saved positions are discarded and the parser returns to
 * TPS_Base.  EOF or a failed test pops.
 */
01558 static const TParserStateActionItem actionTPS_InFURL[] = {
01559 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01560 {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
01561 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01562 };
01563
/*
 * Rules for TPS_InProtocolFirst: requires '/', then moves to
 * TPS_InProtocolSecond.  EOF or anything else pops.
 * NOTE(review): presumably entered after "word:"; the rule that enters this
 * state is not in this part of the file.
 */
01564 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
01565 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01566 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
01567 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01568 };
01569
/*
 * Rules for TPS_InProtocolSecond: requires a second '/', then moves to
 * TPS_InProtocolEnd.  EOF or anything else pops.
 */
01570 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
01571 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01572 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
01573 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01574 };
01575
/*
 * Rules for TPS_InProtocolEnd: a single unconditional rule that emits a
 * PROTOCOL token, discards all saved positions (A_CLRALL) and returns to
 * TPS_Base.
 */
01576 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
01577 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
01578 };
01579
/*
 * Rules for TPS_InHyphenAsciiWordFirst (pushed from TPS_InHyphenAsciiWord
 * on '-'): first character after a hyphen.  p_isasclet keeps the all-ASCII
 * variant, p_isalpha switches to TPS_InHyphenWord, p_isdigit goes to
 * TPS_InHyphenDigitLookahead.  EOF or anything else pops.
 */
01580 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
01581 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01582 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
01583 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01584 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
01585 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01586 };
01587
/*
 * Rules for TPS_InHyphenAsciiWord: inside a hyphenated word that so far
 * consists only of p_isasclet characters.  p_isalpha/p_isspecial characters
 * downgrade to TPS_InHyphenWord, p_isdigit to TPS_InHyphenNumWord; '-'
 * pushes a lookahead for another part.  At EOF or on any other character
 * the SpecialHyphen callback is run, an ASCIIHWORD token is emitted and the
 * parser continues in TPS_InParseHyphen.
 */
01588 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
01589 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
01590 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
01591 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01592 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01593 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01594 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
01595 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
01596 };
01597
/*
 * Rules for TPS_InHyphenWordFirst (pushed from TPS_InHyphenWord on '-'):
 * first character after a hyphen.  p_isalpha continues in TPS_InHyphenWord,
 * p_isdigit goes to TPS_InHyphenDigitLookahead.  EOF or anything else pops.
 */
01598 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
01599 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01600 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01601 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
01602 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01603 };
01604
/*
 * Rules for TPS_InHyphenWord: inside a hyphenated word made of
 * p_isalpha/p_isspecial characters.  p_isdigit switches to
 * TPS_InHyphenNumWord; '-' pushes a lookahead for another part.  At EOF or
 * on any other character the SpecialHyphen callback is run, an HWORD token
 * is emitted and the parser continues in TPS_InParseHyphen.
 */
01605 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
01606 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
01607 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01608 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
01609 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01610 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
01611 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
01612 };
01613
/*
 * Rules for TPS_InHyphenNumWordFirst (pushed from TPS_InHyphenNumWord on
 * '-'): first character after a hyphen.  p_isalpha continues in
 * TPS_InHyphenNumWord, p_isdigit goes to TPS_InHyphenDigitLookahead.
 * EOF or anything else pops.
 */
01614 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
01615 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01616 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01617 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
01618 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01619 };
01620
/*
 * Rules for TPS_InHyphenNumWord: inside a hyphenated word containing
 * digits.  p_isalnum and p_isspecial characters are consumed; '-' pushes a
 * lookahead for another part.  At EOF or on any other character the
 * SpecialHyphen callback is run, a NUMHWORD token is emitted and the parser
 * continues in TPS_InParseHyphen.
 */
01621 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
01622 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
01623 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01624 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01625 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
01626 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
01627 };
01628
/*
 * Rules for TPS_InHyphenDigitLookahead: a hyphen was followed by a digit.
 * Further p_isdigit characters are consumed; the part is accepted (moving
 * to TPS_InHyphenNumWord) only once a p_isalpha or p_isspecial character
 * appears.  EOF or anything else pops, so a purely numeric part after a
 * hyphen does not extend the hyphenated word.
 */
01629 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
01630 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01631 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
01632 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01633 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
01634 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01635 };
01636
/*
 * Rules for TPS_InParseHyphen: the state entered after a whole hyphenated
 * word token has been emitted; it dispatches on the start of each part.
 * p_isasclet -> TPS_InHyphenAsciiWordPart, p_isalpha ->
 * TPS_InHyphenWordPart, p_isdigit pushes TPS_InHyphenUnsignedInt, '-'
 * pushes TPS_InParseHyphenHyphen.  At EOF or on any other character the
 * current position is re-examined in TPS_Base (A_RERUN, nothing consumed).
 * NOTE(review): presumably SpecialHyphen has repositioned the parser at the
 * start of the hyphenated word before this state runs; that callback is not
 * in this part of the file.
 */
01637 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
01638 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
01639 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
01640 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01641 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
01642 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
01643 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
01644 };
01645
/*
 * Rules for TPS_InParseHyphenHyphen (pushed from TPS_InParseHyphen on '-'):
 * if the character after the hyphen satisfies p_isalnum or p_isspecial,
 * what has been consumed since the push (the hyphen) is emitted as a SPACE
 * token, the saved position is dropped (A_CLEAR) and parsing continues in
 * TPS_InParseHyphen; the lookahead character itself is not consumed because
 * A_BINGO stops before advancing.  EOF or anything else pops.
 */
01646 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
01647 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01648 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
01649 {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
01650 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01651 };
01652
/*
 * Rules for TPS_InHyphenWordPart: inside one part of a hyphenated word made
 * of p_isalpha/p_isspecial characters.  p_isdigit switches to
 * TPS_InHyphenNumWordPart.  The part is emitted as PARTHWORD at EOF
 * (returning to TPS_Base) or on any other character (returning to
 * TPS_InParseHyphen for the next part).
 */
01653 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
01654 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
01655 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01656 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01657 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
01658 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
01659 };
01660
/*
 * Rules for TPS_InHyphenAsciiWordPart: inside one part of a hyphenated word
 * that so far consists only of p_isasclet characters.
 * p_isalpha/p_isspecial switch to TPS_InHyphenWordPart, p_isdigit to
 * TPS_InHyphenNumWordPart.  The part is emitted as ASCIIPARTHWORD at EOF
 * (returning to TPS_Base) or on any other character (returning to
 * TPS_InParseHyphen for the next part).
 */
01661 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
01662 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
01663 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
01664 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01665 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
01666 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
01667 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
01668 };
01669
/*
 * Rules for TPS_InHyphenNumWordPart: inside one part of a hyphenated word
 * that contains digits.  p_isalnum and p_isspecial characters are consumed.
 * The part is emitted as NUMPARTHWORD at EOF (returning to TPS_Base) or on
 * any other character (returning to TPS_InParseHyphen for the next part).
 */
01670 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
01671 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
01672 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
01673 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
01674 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
01675 };
01676
/*
 * Rules for TPS_InHyphenUnsignedInt (pushed from TPS_InParseHyphen on a
 * digit): p_isdigit characters are consumed in place.  If a p_isalpha or
 * p_isspecial character follows, the saved position is dropped (A_CLEAR)
 * and the part continues as TPS_InHyphenNumWordPart.  EOF or anything else
 * pops, returning to TPS_InParseHyphen at the first digit.
 */
01677 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
01678 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
01679 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
01680 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
01681 {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
01682 {NULL, 0, A_POP, TPS_Null, 0, NULL}
01683 };
01684
01685
01686
01687
01688
/*
 * One entry of the state dispatch table: associates a parser state with the
 * rule list that is evaluated while the parser is in that state.
 *
 *	action		rule list; scanned in order until a rule matches or a rule
 *				with a NULL test is reached
 *	state		the state this entry belongs to (checked by an assertion
 *				against the entry's index)
 *	state_name	printable state name, present only in WPARSER_TRACE builds
 */
01689 typedef struct
01690 {
01691 const TParserStateActionItem *action;
01692 TParserState state;
01693 #ifdef WPARSER_TRACE
01694 const char *state_name;
01695 #endif
01696 } TParserStateAction;
01697
/*
 * TPARSERSTATEACTION(state) expands to an initializer for one
 * TParserStateAction entry.  The rule list is referenced by pasting
 * "action" in front of the state name (so the list for TPS_Foo must be
 * named actionTPS_Foo); in WPARSER_TRACE builds the state name is also
 * stored as a string for the trace output.
 */
01698 #ifdef WPARSER_TRACE
01699 #define TPARSERSTATEACTION(state) \
01700 { CppConcat(action,state), state, CppAsString(state) }
01701 #else
01702 #define TPARSERSTATEACTION(state) \
01703 { CppConcat(action,state), state }
01704 #endif
01705
01706
01707
01708
01709
/*
 * Dispatch table from parser state to rule list.
 *
 * TParserGet() indexes this array directly with the current state value and
 * asserts that Actions[state].state == state, so the entries must be listed
 * in exactly the same order as the TParserState values, starting with
 * TPS_Base.  When adding a state, add its rule list and its entry here at
 * the matching position.
 */
01710 static const TParserStateAction Actions[] = {
01711 TPARSERSTATEACTION(TPS_Base),
01712 TPARSERSTATEACTION(TPS_InNumWord),
01713 TPARSERSTATEACTION(TPS_InAsciiWord),
01714 TPARSERSTATEACTION(TPS_InWord),
01715 TPARSERSTATEACTION(TPS_InUnsignedInt),
01716 TPARSERSTATEACTION(TPS_InSignedIntFirst),
01717 TPARSERSTATEACTION(TPS_InSignedInt),
01718 TPARSERSTATEACTION(TPS_InSpace),
01719 TPARSERSTATEACTION(TPS_InUDecimalFirst),
01720 TPARSERSTATEACTION(TPS_InUDecimal),
01721 TPARSERSTATEACTION(TPS_InDecimalFirst),
01722 TPARSERSTATEACTION(TPS_InDecimal),
01723 TPARSERSTATEACTION(TPS_InVerVersion),
01724 TPARSERSTATEACTION(TPS_InSVerVersion),
01725 TPARSERSTATEACTION(TPS_InVersionFirst),
01726 TPARSERSTATEACTION(TPS_InVersion),
01727 TPARSERSTATEACTION(TPS_InMantissaFirst),
01728 TPARSERSTATEACTION(TPS_InMantissaSign),
01729 TPARSERSTATEACTION(TPS_InMantissa),
01730 TPARSERSTATEACTION(TPS_InXMLEntityFirst),
01731 TPARSERSTATEACTION(TPS_InXMLEntity),
01732 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
01733 TPARSERSTATEACTION(TPS_InXMLEntityNum),
01734 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
01735 TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
01736 TPARSERSTATEACTION(TPS_InXMLEntityEnd),
01737 TPARSERSTATEACTION(TPS_InTagFirst),
01738 TPARSERSTATEACTION(TPS_InXMLBegin),
01739 TPARSERSTATEACTION(TPS_InTagCloseFirst),
01740 TPARSERSTATEACTION(TPS_InTagName),
01741 TPARSERSTATEACTION(TPS_InTagBeginEnd),
01742 TPARSERSTATEACTION(TPS_InTag),
01743 TPARSERSTATEACTION(TPS_InTagEscapeK),
01744 TPARSERSTATEACTION(TPS_InTagEscapeKK),
01745 TPARSERSTATEACTION(TPS_InTagBackSleshed),
01746 TPARSERSTATEACTION(TPS_InTagEnd),
01747 TPARSERSTATEACTION(TPS_InCommentFirst),
01748 TPARSERSTATEACTION(TPS_InCommentLast),
01749 TPARSERSTATEACTION(TPS_InComment),
01750 TPARSERSTATEACTION(TPS_InCloseCommentFirst),
01751 TPARSERSTATEACTION(TPS_InCloseCommentLast),
01752 TPARSERSTATEACTION(TPS_InCommentEnd),
01753 TPARSERSTATEACTION(TPS_InHostFirstDomain),
01754 TPARSERSTATEACTION(TPS_InHostDomainSecond),
01755 TPARSERSTATEACTION(TPS_InHostDomain),
01756 TPARSERSTATEACTION(TPS_InPortFirst),
01757 TPARSERSTATEACTION(TPS_InPort),
01758 TPARSERSTATEACTION(TPS_InHostFirstAN),
01759 TPARSERSTATEACTION(TPS_InHost),
01760 TPARSERSTATEACTION(TPS_InEmail),
01761 TPARSERSTATEACTION(TPS_InFileFirst),
01762 TPARSERSTATEACTION(TPS_InFileTwiddle),
01763 TPARSERSTATEACTION(TPS_InPathFirst),
01764 TPARSERSTATEACTION(TPS_InPathFirstFirst),
01765 TPARSERSTATEACTION(TPS_InPathSecond),
01766 TPARSERSTATEACTION(TPS_InFile),
01767 TPARSERSTATEACTION(TPS_InFileNext),
01768 TPARSERSTATEACTION(TPS_InURLPathFirst),
01769 TPARSERSTATEACTION(TPS_InURLPathStart),
01770 TPARSERSTATEACTION(TPS_InURLPath),
01771 TPARSERSTATEACTION(TPS_InFURL),
01772 TPARSERSTATEACTION(TPS_InProtocolFirst),
01773 TPARSERSTATEACTION(TPS_InProtocolSecond),
01774 TPARSERSTATEACTION(TPS_InProtocolEnd),
01775 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
01776 TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
01777 TPARSERSTATEACTION(TPS_InHyphenWordFirst),
01778 TPARSERSTATEACTION(TPS_InHyphenWord),
01779 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
01780 TPARSERSTATEACTION(TPS_InHyphenNumWord),
01781 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
01782 TPARSERSTATEACTION(TPS_InParseHyphen),
01783 TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
01784 TPARSERSTATEACTION(TPS_InHyphenWordPart),
01785 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
01786 TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
01787 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
01788 };
01789
01790
/*
 * TParserGet: run the state machine until one token has been recognized.
 *
 * Starting at prs->state->posbyte, the function repeatedly looks up the
 * rule list of the current state in Actions[], selects the first rule whose
 * isclass() test returns nonzero (a rule with a NULL isclass ends the list
 * and always applies), and carries out that rule's special callback, flags
 * and state change.
 *
 * On success prs->token points at the token start and prs->lenbytetoken,
 * prs->lenchartoken and prs->type describe it.
 *
 * Returns true if the last rule applied had A_BINGO set (a token was
 * produced); false if the input was already exhausted on entry or the loop
 * ended without A_BINGO.
 */
01791 static bool
01792 TParserGet(TParser *prs)
01793 {
01794 const TParserStateActionItem *item = NULL;
01795
01796 Assert(prs->state);
01797
01798 if (prs->state->posbyte >= prs->lenstr)
01799 return false;
01800
01801 prs->token = prs->str + prs->state->posbyte;
01802 prs->state->pushedAtAction = NULL;
01803
01804 /* one iteration per rule application; the EOF position (posbyte == lenstr) is visited too */
01805 while (prs->state->posbyte <= prs->lenstr)
01806 {
01807 if (prs->state->posbyte == prs->lenstr)
01808 prs->state->charlen = 0;
01809 else
01810 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
01811 pg_mblen(prs->str + prs->state->posbyte);
01812
01813 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
01814 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
01815 Assert(Actions[prs->state->state].state == prs->state->state);
01816
01817 if (prs->state->pushedAtAction)
01818 {
01819 /* back from a popped sub-parse: continue with the rule after the one that pushed */
01820 item = prs->state->pushedAtAction + 1;
01821 prs->state->pushedAtAction = NULL;
01822 }
01823 else
01824 {
01825 item = Actions[prs->state->state].action;
01826 Assert(item != NULL);
01827 }
01828
01829 /* find the first matching rule; prs->c passes the rule's comparison character to the test */
01830 while (item->isclass)
01831 {
01832 prs->c = item->c;
01833 if (item->isclass(prs) != 0)
01834 break;
01835 item++;
01836 }
01837
01838 #ifdef WPARSER_TRACE
01839 {
01840 TParserPosition *ptr;
01841
01842 fprintf(stderr, "state ");
01843
01844 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
01845 fprintf(stderr, " ");
01846 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
01847 if (prs->state->posbyte < prs->lenstr)
01848 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
01849 else
01850 fprintf(stderr, "at EOF");
01851 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
01852 (int) (item - Actions[prs->state->state].action),
01853 (item->flags & A_BINGO) ? " BINGO" : "",
01854 (item->flags & A_POP) ? " POP" : "",
01855 (item->flags & A_PUSH) ? " PUSH" : "",
01856 (item->flags & A_RERUN) ? " RERUN" : "",
01857 (item->flags & A_CLEAR) ? " CLEAR" : "",
01858 (item->flags & A_MERGE) ? " MERGE" : "",
01859 (item->flags & A_CLRALL) ? " CLRALL" : "",
01860 (item->tostate != TPS_Null) ? " tostate " : "",
01861 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
01862 (item->type > 0) ? " type " : "",
01863 tok_alias[item->type]);
01864 }
01865 #endif
01866
01867 /* run the rule's special callback, if any, before acting on its flags */
01868 if (item->special)
01869 item->special(prs);
01870
01871 /* A_BINGO: publish the accumulated token length and type, then reset the per-state counters */
01872 if (item->flags & A_BINGO)
01873 {
01874 Assert(item->type > 0);
01875 prs->lenbytetoken = prs->state->lenbytetoken;
01876 prs->lenchartoken = prs->state->lenchartoken;
01877 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
01878 prs->type = item->type;
01879 }
01880
01881 /* position-stack handling: only the first of POP, PUSH, CLEAR, CLRALL, MERGE that is set takes effect */
01882 if (item->flags & A_POP)
01883 {
01884 TParserPosition *ptr = prs->state->prev;
01885
01886 pfree(prs->state);
01887 prs->state = ptr;
01888 Assert(prs->state);
01889 }
01890 else if (item->flags & A_PUSH)
01891 {
01892 prs->state->pushedAtAction = item;
01893 prs->state = newTParserPosition(prs->state);
01894 }
01895 else if (item->flags & A_CLEAR)
01896 {
01897 TParserPosition *ptr;
01898
01899 Assert(prs->state->prev);
01900 ptr = prs->state->prev->prev;
01901 pfree(prs->state->prev);
01902 prs->state->prev = ptr;
01903 }
01904 else if (item->flags & A_CLRALL)
01905 {
01906 TParserPosition *ptr;
01907
01908 while (prs->state->prev)
01909 {
01910 ptr = prs->state->prev->prev;
01911 pfree(prs->state->prev);
01912 prs->state->prev = ptr;
01913 }
01914 }
01915 else if (item->flags & A_MERGE)
01916 {
01917 TParserPosition *ptr = prs->state;
01918
01919 Assert(prs->state->prev);
01920 prs->state = prs->state->prev;
01921
01922 prs->state->posbyte = ptr->posbyte;
01923 prs->state->poschar = ptr->poschar;
01924 prs->state->charlen = ptr->charlen;
01925 prs->state->lenbytetoken = ptr->lenbytetoken;
01926 prs->state->lenchartoken = ptr->lenchartoken;
01927 pfree(ptr);
01928 }
01929
01930 /* a TPS_Null target means "stay in the current state" */
01931 if (item->tostate != TPS_Null)
01932 prs->state->state = item->tostate;
01933
01934 /* stop once a token is complete, or at end of input unless the rule asked for a rerun */
01935 if ((item->flags & A_BINGO) ||
01936 (prs->state->posbyte >= prs->lenstr &&
01937 (item->flags & A_RERUN) == 0))
01938 break;
01939
01940 /* RERUN and POP re-evaluate the same character without advancing */
01941 if (item->flags & (A_RERUN | A_POP))
01942 continue;
01943
01944 /* otherwise consume the current character (charlen is 0 at EOF, so nothing moves there) */
01945 if (prs->state->charlen)
01946 {
01947 prs->state->posbyte += prs->state->charlen;
01948 prs->state->lenbytetoken += prs->state->charlen;
01949 prs->state->poschar++;
01950 prs->state->lenchartoken++;
01951 }
01952 }
01953
01954 return (item && (item->flags & A_BINGO)) ? true : false;
01955 }
01956
01957 Datum
01958 prsd_lextype(PG_FUNCTION_ARGS)
01959 {
01960 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
01961 int i;
01962
01963 for (i = 1; i <= LASTNUM; i++)
01964 {
01965 descr[i - 1].lexid = i;
01966 descr[i - 1].alias = pstrdup(tok_alias[i]);
01967 descr[i - 1].descr = pstrdup(lex_descr[i]);
01968 }
01969
01970 descr[LASTNUM].lexid = 0;
01971
01972 PG_RETURN_POINTER(descr);
01973 }
01974
01975 Datum
01976 prsd_start(PG_FUNCTION_ARGS)
01977 {
01978 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
01979 }
01980
01981 Datum
01982 prsd_nexttoken(PG_FUNCTION_ARGS)
01983 {
01984 TParser *p = (TParser *) PG_GETARG_POINTER(0);
01985 char **t = (char **) PG_GETARG_POINTER(1);
01986 int *tlen = (int *) PG_GETARG_POINTER(2);
01987
01988 if (!TParserGet(p))
01989 PG_RETURN_INT32(0);
01990
01991 *t = p->token;
01992 *tlen = p->lenbytetoken;
01993
01994 PG_RETURN_INT32(p->type);
01995 }
01996
01997 Datum
01998 prsd_end(PG_FUNCTION_ARGS)
01999 {
02000 TParser *p = (TParser *) PG_GETARG_POINTER(0);
02001
02002 TParserClose(p);
02003 PG_RETURN_VOID();
02004 }
02005
/*
 * Token-type classification macros used by the headline code.  Each takes a
 * token type number (one of the ASCIIWORD .. XMLENTITY constants) and
 * evaluates to nonzero if the type belongs to the class.
 */
/* true for SPACE tokens */
02006 #define LEAVETOKEN(x) ( (x)==SPACE )
/* true for token types that span several simpler tokens: URLs and whole hyphenated words */
02007 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* true for SPACE tokens */
02008 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
02009
/* true for tags, protocol heads, space and XML entities */
02010 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* token types flagged "replace" by mark_fragment() when highlight is 0 (tags) */
02011 #define HLIDREPLACE(x) ( (x)==TAG_T )
/* token types flagged "skip" by mark_fragment() when highlight is 0 */
02012 #define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* token types flagged "skip" when highlight is nonzero; currently the same set as HLIDSKIP */
02013 #define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* token types that are not counted as words when measuring fragment length */
02014 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* token types a fragment should not begin or end with: non-words, numbers and ignorable tokens */
02015 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
02016
/*
 * A window onto the headline word array, passed as the opaque "checkval" to
 * checkcondition_HL() via TS_execute().
 *
 *	words	first word of the window
 *	len		number of words in the window
 */
02017 typedef struct
02018 {
02019 HeadlineWordEntry *words;
02020 int len;
02021 } hlCheck;
02022
02023 static bool
02024 checkcondition_HL(void *checkval, QueryOperand *val)
02025 {
02026 int i;
02027
02028 for (i = 0; i < ((hlCheck *) checkval)->len; i++)
02029 {
02030 if (((hlCheck *) checkval)->words[i].item == val)
02031 return true;
02032 }
02033 return false;
02034 }
02035
02036
02037 static bool
02038 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
02039 {
02040 int i,
02041 j;
02042 QueryItem *item = GETQUERY(query);
02043 int pos = *p;
02044
02045 *q = -1;
02046 *p = 0x7fffffff;
02047
02048 for (j = 0; j < query->size; j++)
02049 {
02050 if (item->type != QI_VAL)
02051 {
02052 item++;
02053 continue;
02054 }
02055 for (i = pos; i < prs->curwords; i++)
02056 {
02057 if (prs->words[i].item == &item->qoperand)
02058 {
02059 if (i > *q)
02060 *q = i;
02061 break;
02062 }
02063 }
02064 item++;
02065 }
02066
02067 if (*q < 0)
02068 return false;
02069
02070 item = GETQUERY(query);
02071 for (j = 0; j < query->size; j++)
02072 {
02073 if (item->type != QI_VAL)
02074 {
02075 item++;
02076 continue;
02077 }
02078 for (i = *q; i >= pos; i--)
02079 {
02080 if (prs->words[i].item == &item->qoperand)
02081 {
02082 if (i < *p)
02083 *p = i;
02084 break;
02085 }
02086 }
02087 item++;
02088 }
02089
02090 if (*p <= *q)
02091 {
02092 hlCheck ch;
02093
02094 ch.words = &(prs->words[*p]);
02095 ch.len = *q - *p + 1;
02096 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
02097 return true;
02098 else
02099 {
02100 (*p)++;
02101 return hlCover(prs, query, p, q);
02102 }
02103 }
02104
02105 return false;
02106 }
02107
02108 static void
02109 mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
02110 {
02111 int i;
02112
02113 for (i = startpos; i <= endpos; i++)
02114 {
02115 if (prs->words[i].item)
02116 prs->words[i].selected = 1;
02117 if (highlight == 0)
02118 {
02119 if (HLIDREPLACE(prs->words[i].type))
02120 prs->words[i].replace = 1;
02121 else if (HLIDSKIP(prs->words[i].type))
02122 prs->words[i].skip = 1;
02123 }
02124 else
02125 {
02126 if (XMLHLIDSKIP(prs->words[i].type))
02127 prs->words[i].skip = 1;
02128 }
02129
02130 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
02131 }
02132 }
02133
/*
 * One candidate headline fragment, as collected by mark_hl_fragments().
 *
 *	startpos	index of the first word of the fragment
 *	endpos		index of the last word of the fragment
 *	poslen		number of non-repeated query items in the fragment
 *	curlen		number of words in the fragment (NONWORDTOKEN types are not
 *				counted)
 *	in			set to 1 once the fragment has been chosen for the headline
 *	excluded	set to 1 when the fragment overlaps a chosen fragment
 */
02134 typedef struct
02135 {
02136 int32 startpos;
02137 int32 endpos;
02138 int32 poslen;
02139 int32 curlen;
02140 int16 in;
02141 int16 excluded;
02142 } CoverPos;
02143
02144 static void
02145 get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
02146 int *curlen, int *poslen, int max_words)
02147 {
02148 int i;
02149
02150
02151
02152
02153
02154
02155
02156
02157
02158 for (i = *startpos; i <= *endpos; i++)
02159 {
02160 *startpos = i;
02161 if (prs->words[i].item && !prs->words[i].repeated)
02162 break;
02163 }
02164
02165 *curlen = 0;
02166 *poslen = 0;
02167 for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
02168 {
02169 if (!NONWORDTOKEN(prs->words[i].type))
02170 *curlen += 1;
02171 if (prs->words[i].item && !prs->words[i].repeated)
02172 *poslen += 1;
02173 }
02174
02175 if (*endpos > i)
02176 {
02177 *endpos = i;
02178 for (i = *endpos; i >= *startpos; i--)
02179 {
02180 *endpos = i;
02181 if (prs->words[i].item && !prs->words[i].repeated)
02182 break;
02183 if (!NONWORDTOKEN(prs->words[i].type))
02184 *curlen -= 1;
02185 }
02186 }
02187 }
02188
02189 static void
02190 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
02191 int shortword, int min_words,
02192 int max_words, int max_fragments)
02193 {
02194 int32 poslen,
02195 curlen,
02196 i,
02197 f,
02198 num_f = 0;
02199 int32 stretch,
02200 maxstretch,
02201 posmarker;
02202
02203 int32 startpos = 0,
02204 endpos = 0,
02205 p = 0,
02206 q = 0;
02207
02208 int32 numcovers = 0,
02209 maxcovers = 32;
02210
02211 int32 minI,
02212 minwords,
02213 maxitems;
02214 CoverPos *covers;
02215
02216 covers = palloc(maxcovers * sizeof(CoverPos));
02217
02218
02219 while (hlCover(prs, query, &p, &q))
02220 {
02221 startpos = p;
02222 endpos = q;
02223
02224
02225
02226
02227
02228
02229
02230
02231 while (startpos <= endpos)
02232 {
02233 get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
02234 if (numcovers >= maxcovers)
02235 {
02236 maxcovers *= 2;
02237 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
02238 }
02239 covers[numcovers].startpos = startpos;
02240 covers[numcovers].endpos = endpos;
02241 covers[numcovers].curlen = curlen;
02242 covers[numcovers].poslen = poslen;
02243 covers[numcovers].in = 0;
02244 covers[numcovers].excluded = 0;
02245 numcovers++;
02246 startpos = endpos + 1;
02247 endpos = q;
02248 }
02249
02250 p++;
02251 }
02252
02253
02254 for (f = 0; f < max_fragments; f++)
02255 {
02256 maxitems = 0;
02257 minwords = 0x7fffffff;
02258 minI = -1;
02259
02260
02261
02262
02263
02264 for (i = 0; i < numcovers; i++)
02265 {
02266 if (!covers[i].in && !covers[i].excluded &&
02267 (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
02268 && minwords > covers[i].curlen)))
02269 {
02270 maxitems = covers[i].poslen;
02271 minwords = covers[i].curlen;
02272 minI = i;
02273 }
02274 }
02275
02276 if (minI >= 0)
02277 {
02278 covers[minI].in = 1;
02279
02280 startpos = covers[minI].startpos;
02281 endpos = covers[minI].endpos;
02282 curlen = covers[minI].curlen;
02283
02284 if (curlen < max_words)
02285 {
02286
02287 maxstretch = (max_words - curlen) / 2;
02288
02289
02290
02291
02292
02293
02294 stretch = 0;
02295 posmarker = startpos;
02296 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
02297 {
02298 if (!NONWORDTOKEN(prs->words[i].type))
02299 {
02300 curlen++;
02301 stretch++;
02302 }
02303 posmarker = i;
02304 }
02305
02306 for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
02307 {
02308 if (!NONWORDTOKEN(prs->words[i].type))
02309 curlen--;
02310 }
02311 startpos = i;
02312
02313 posmarker = endpos;
02314 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
02315 {
02316 if (!NONWORDTOKEN(prs->words[i].type))
02317 curlen++;
02318 posmarker = i;
02319 }
02320
02321 for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
02322 {
02323 if (!NONWORDTOKEN(prs->words[i].type))
02324 curlen--;
02325 }
02326 endpos = i;
02327 }
02328 covers[minI].startpos = startpos;
02329 covers[minI].endpos = endpos;
02330 covers[minI].curlen = curlen;
02331
02332 mark_fragment(prs, highlight, startpos, endpos);
02333 num_f++;
02334
02335 for (i = 0; i < numcovers; i++)
02336 {
02337 if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
02338 covers[i].excluded = 1;
02339 }
02340 }
02341 else
02342 break;
02343 }
02344
02345
02346 if (num_f <= 0)
02347 {
02348 startpos = endpos = curlen = 0;
02349 for (i = 0; i < prs->curwords && curlen < min_words; i++)
02350 {
02351 if (!NONWORDTOKEN(prs->words[i].type))
02352 curlen++;
02353 endpos = i;
02354 }
02355 mark_fragment(prs, highlight, startpos, endpos);
02356 }
02357 pfree(covers);
02358 }
02359
02360 static void
02361 mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
02362 int shortword, int min_words, int max_words)
02363 {
02364 int p = 0,
02365 q = 0;
02366 int bestb = -1,
02367 beste = -1;
02368 int bestlen = -1;
02369 int pose = 0,
02370 posb,
02371 poslen,
02372 curlen;
02373
02374 int i;
02375
02376 if (highlight == 0)
02377 {
02378 while (hlCover(prs, query, &p, &q))
02379 {
02380
02381 curlen = 0;
02382 poslen = 0;
02383 for (i = p; i <= q && curlen < max_words; i++)
02384 {
02385 if (!NONWORDTOKEN(prs->words[i].type))
02386 curlen++;
02387 if (prs->words[i].item && !prs->words[i].repeated)
02388 poslen++;
02389 pose = i;
02390 }
02391
02392 if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
02393 {
02394
02395 p++;
02396 continue;
02397 }
02398
02399 posb = p;
02400 if (curlen < max_words)
02401 {
02402 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
02403 {
02404 if (i != q)
02405 {
02406 if (!NONWORDTOKEN(prs->words[i].type))
02407 curlen++;
02408 if (prs->words[i].item && !prs->words[i].repeated)
02409 poslen++;
02410 }
02411 pose = i;
02412 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
02413 continue;
02414 if (curlen >= min_words)
02415 break;
02416 }
02417 if (curlen < min_words && i >= prs->curwords)
02418 {
02419
02420 for (i = p - 1; i >= 0; i--)
02421 {
02422 if (!NONWORDTOKEN(prs->words[i].type))
02423 curlen++;
02424 if (prs->words[i].item && !prs->words[i].repeated)
02425 poslen++;
02426 if (curlen >= max_words)
02427 break;
02428 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
02429 continue;
02430 if (curlen >= min_words)
02431 break;
02432 }
02433 posb = (i >= 0) ? i : 0;
02434 }
02435 }
02436 else
02437 {
02438 for (; curlen > min_words; i--)
02439 {
02440 if (!NONWORDTOKEN(prs->words[i].type))
02441 curlen--;
02442 if (prs->words[i].item && !prs->words[i].repeated)
02443 poslen--;
02444 pose = i;
02445 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
02446 continue;
02447 break;
02448 }
02449 }
02450
02451 if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
02452 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
02453 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
02454 {
02455 bestb = posb;
02456 beste = pose;
02457 bestlen = poslen;
02458 }
02459
02460 p++;
02461 }
02462
02463 if (bestlen < 0)
02464 {
02465 curlen = 0;
02466 for (i = 0; i < prs->curwords && curlen < min_words; i++)
02467 {
02468 if (!NONWORDTOKEN(prs->words[i].type))
02469 curlen++;
02470 pose = i;
02471 }
02472 bestb = 0;
02473 beste = pose;
02474 }
02475 }
02476 else
02477 {
02478 bestb = 0;
02479 beste = prs->curwords - 1;
02480 }
02481
02482 for (i = bestb; i <= beste; i++)
02483 {
02484 if (prs->words[i].item)
02485 prs->words[i].selected = 1;
02486 if (highlight == 0)
02487 {
02488 if (HLIDREPLACE(prs->words[i].type))
02489 prs->words[i].replace = 1;
02490 else if (HLIDSKIP(prs->words[i].type))
02491 prs->words[i].skip = 1;
02492 }
02493 else
02494 {
02495 if (XMLHLIDSKIP(prs->words[i].type))
02496 prs->words[i].skip = 1;
02497 }
02498
02499 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
02500 }
02501
02502 }
02503
02504 Datum
02505 prsd_headline(PG_FUNCTION_ARGS)
02506 {
02507 HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
02508 List *prsoptions = (List *) PG_GETARG_POINTER(1);
02509 TSQuery query = PG_GETARG_TSQUERY(2);
02510
02511
02512 int min_words = 15;
02513 int max_words = 35;
02514 int shortword = 3;
02515 int max_fragments = 0;
02516 int highlight = 0;
02517 ListCell *l;
02518
02519
02520 prs->startsel = NULL;
02521 prs->stopsel = NULL;
02522 foreach(l, prsoptions)
02523 {
02524 DefElem *defel = (DefElem *) lfirst(l);
02525 char *val = defGetString(defel);
02526
02527 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
02528 max_words = pg_atoi(val, sizeof(int32), 0);
02529 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
02530 min_words = pg_atoi(val, sizeof(int32), 0);
02531 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
02532 shortword = pg_atoi(val, sizeof(int32), 0);
02533 else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
02534 max_fragments = pg_atoi(val, sizeof(int32), 0);
02535 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
02536 prs->startsel = pstrdup(val);
02537 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
02538 prs->stopsel = pstrdup(val);
02539 else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
02540 prs->fragdelim = pstrdup(val);
02541 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
02542 highlight = (pg_strcasecmp(val, "1") == 0 ||
02543 pg_strcasecmp(val, "on") == 0 ||
02544 pg_strcasecmp(val, "true") == 0 ||
02545 pg_strcasecmp(val, "t") == 0 ||
02546 pg_strcasecmp(val, "y") == 0 ||
02547 pg_strcasecmp(val, "yes") == 0);
02548 else
02549 ereport(ERROR,
02550 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02551 errmsg("unrecognized headline parameter: \"%s\"",
02552 defel->defname)));
02553 }
02554
02555 if (highlight == 0)
02556 {
02557 if (min_words >= max_words)
02558 ereport(ERROR,
02559 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02560 errmsg("MinWords should be less than MaxWords")));
02561 if (min_words <= 0)
02562 ereport(ERROR,
02563 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02564 errmsg("MinWords should be positive")));
02565 if (shortword < 0)
02566 ereport(ERROR,
02567 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02568 errmsg("ShortWord should be >= 0")));
02569 if (max_fragments < 0)
02570 ereport(ERROR,
02571 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02572 errmsg("MaxFragments should be >= 0")));
02573 }
02574
02575 if (max_fragments == 0)
02576
02577 mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
02578 else
02579 mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
02580
02581 if (!prs->startsel)
02582 prs->startsel = pstrdup("<b>");
02583 if (!prs->stopsel)
02584 prs->stopsel = pstrdup("</b>");
02585 if (!prs->fragdelim)
02586 prs->fragdelim = pstrdup(" ... ");
02587 prs->startsellen = strlen(prs->startsel);
02588 prs->stopsellen = strlen(prs->stopsel);
02589 prs->fragdelimlen = strlen(prs->fragdelim);
02590
02591 PG_RETURN_POINTER(prs);
02592 }