Xapian: Internal Source Documentation: xapian-core: languages/compiler/tokeniser.c Source File

00001 
#include <stdio.h>   /* stderr etc */
#include <stdlib.h>  /* malloc free */
#include <string.h>  /* strlen */
#include <ctype.h>   /* isalpha etc */
#include <limits.h>  /* INT_MAX */
#include "header.h"
00007 
00008 struct system_word {
00009     int s_size;   /* size of system word */
00010     byte * s;     /* pointer to the system word */
00011     int code;     /* it's internal code */
00012 };
00013 
00014 
/* ASCII collating assumed in syswords.h */
00016 
00017 #include "syswords.h"
00018 
/* Return the lesser of a and b. */
static int smaller(int a, int b) {
    if (a < b) return a;
    return b;
}
00020 
00021 extern symbol * get_input(symbol * p) {
00022 
00023     char * s = b_to_s(p);
00024     {
00025         FILE * input = fopen(s, "r");
00026         free(s);
00027         if (input == 0) return 0;
00028         {
00029             symbol * u = create_b(STARTSIZE);
00030             int size = 0;
00031             repeat
00032             {   int ch = getc(input);
00033                 if (ch == EOF) break;
00034                 if (size >= CAPACITY(u)) u = increase_capacity(u, size/2);
00035                 u[size++] = ch;
00036             }
00037             fclose(input);
00038             SIZE(u) = size; return u;
00039         }
00040     }
00041 }
00042 
00043 static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) {
00044     if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); }
00045     fprintf(stderr, "Line %d", t->line_number);
00046     if (t->get_depth > 0) fprintf(stderr, " (of included file)");
00047     fprintf(stderr, ": ");
00048     unless (s1 == 0) fprintf(stderr, "%s", s1);
00049     unless (p == 0) {
00050         int i;
00051         for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]);
00052     }
00053     unless (s2 == 0) fprintf(stderr, "%s", s2);
00054     fprintf(stderr, "\n");
00055     t->error_count++;
00056 }
00057 
00058 static void error1(struct tokeniser * t, char * s) {
00059     error(t, s, 0,0, 0);
00060 }
00061 
00062 static void error2(struct tokeniser * t, char * s) {
00063     error(t, "unexpected end of text after ", 0,0, s);
00064 }
00065 
00066 static int compare_words(int m, symbol * p, int n, byte * q) {
00067     unless (m == n) return m - n;
00068     {
00069         int i; for (i = 0; i < n; i++) {
00070             int diff = p[i] - q[i];
00071             unless (diff == 0) return diff;
00072         }
00073     }
00074     return 0;
00075 }
00076 
00077 static int find_word(int n, symbol * p) {
00078     int i = 0; int j = vocab->code;
00079     repeat {
00080         int k = i + (j - i)/2;
00081         struct system_word * w = vocab + k;
00082         int diff = compare_words(n, p, w->s_size, w->s);
00083         if (diff == 0) return w->code;
00084         if (diff < 0) j = k; else i = k;
00085         if (j - i == 1) break;
00086     }
00087     return -1;
00088 }
00089 
00090 static int get_number(int n, symbol * p) {
00091     int x = 0;
00092     int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0';
00093     return x;
00094 }
00095 
00096 static int eq_s(struct tokeniser * t, char * s) {
00097     int l = strlen(s);
00098     if (SIZE(t->p) - t->c < l) return false;
00099     {
00100         int i;
00101         for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false;
00102     }
00103     t->c += l; return true;
00104 }
00105 
00106 static int white_space(struct tokeniser * t, int ch) {
00107     switch (ch) {
00108         case '\n': t->line_number++;
00109         case '\r':
00110         case '\t':
00111         case ' ': return true;
00112     }
00113     return false;
00114 }
00115 
00116 static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
00117     struct m_pair * q = t->m_pairs;
00118     repeat {
00119         if (q == 0) return 0;
00120         {
00121             symbol * name = q->name;
00122             if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
00123         }
00124         q = q->next;
00125     }
00126 }
00127 
00128 static int read_literal_string(struct tokeniser * t, int c) {
00129     symbol * p = t->p;
00130     int ch;
00131     SIZE(t->b) = 0;
00132     repeat {
00133         if (c >= SIZE(p)) { error2(t, "'"); return c; }
00134         ch = p[c];
00135         if (ch == '\n') { error1(t, "string not terminated"); return c; }
00136         c++;
00137         if (ch == t->m_start) {
00138             int c0 = c;
00139             int newlines = false; /* no newlines as yet */
00140             int black_found = false; /* no printing chars as yet */
00141             repeat {
00142                 if (c >= SIZE(p)) { error2(t, "'"); return c; }
00143                 ch = p[c]; c++;
00144                 if (ch == t->m_end) break;
00145                 unless (white_space(t, ch)) black_found = true;
00146                 if (ch == '\n') newlines = true;
00147                 if (newlines && black_found) {
00148                     error1(t, "string not terminated");
00149                     return c;
00150                 }
00151             }
00152             unless (newlines) {
00153                 int n = c - c0 - 1;    /* macro size */
00154                 int firstch = p[c0];
00155                 symbol * q = find_in_m(t, n, p + c0);
00156                 if (q == 0) {
00157                     if (n == 1 && (firstch == '\'' || firstch == t->m_start))
00158                         t->b = add_to_b(t->b, 1, p + c0);
00159                     else
00160                         error(t, "string macro '", n, p + c0, "' undeclared");
00161                 } else
00162                     t->b = add_to_b(t->b, SIZE(q), q);
00163             }
00164         } else {
00165             if (ch == '\'') return c;
00166             t->b = add_to_b(t->b, 1, p + c - 1);
00167         }
00168     }
00169 }
00170 
00171 static int next_token(struct tokeniser * t) {
00172     symbol * p = t->p;
00173     int c = t->c;
00174     int ch;
00175     int code = -1;
00176     repeat {
00177         if (c >= SIZE(p)) { t->c = c; return -1; }
00178         ch = p[c];
00179         if (white_space(t, ch)) { c++; continue; }
00180         if (isalpha(ch)) {
00181             int c0 = c;
00182             while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
00183             code = find_word(c - c0, p + c0);
00184             if (code < 0) {
00185                 t->b = move_to_b(t->b, c - c0, p + c0);
00186                 code = c_name;
00187             }
00188         } else
00189         if (isdigit(ch)) {
00190             int c0 = c;
00191             while (c < SIZE(p) && isdigit(p[c])) c++;
00192             t->number = get_number(c - c0, p + c0);
00193             code = c_number;
00194         } else
00195         if (ch == '\'') {
00196             c = read_literal_string(t, c + 1);
00197             code = c_literalstring;
00198         } else
00199         {
00200             int lim = smaller(2, SIZE(p) - c);
00201             int i;
00202             for (i = lim; i > 0; i--) {
00203                 code = find_word(i, p + c);
00204                 if (code >= 0) { c += i; break; }
00205             }
00206         }
00207         if (code >= 0) {
00208             t->c = c;
00209             return code;
00210         }
00211         error(t, "'", 1, p + c, "' unknown");
00212         c++;
00213         continue;
00214     }
00215 }
00216 
00217 static int next_char(struct tokeniser * t) {
00218     if (t->c >= SIZE(t->p)) return -1;
00219     return t->p[t->c++];
00220 }
00221 
/* Return the next character that is not white space, or -1 at the end of
 * the text. Skipped newlines are counted by white_space().
 */
static int next_real_char(struct tokeniser * t) {
    int ch;
    do {
        ch = next_char(t);
    } while (white_space(t, ch));
    return ch;
}
00229 
00230 static void read_chars(struct tokeniser * t) {
00231     int ch = next_real_char(t);
00232     if (ch < 0) { error2(t, "stringdef"); return; }
00233     {
00234         int c0 = t->c-1;
00235         repeat {
00236             ch = next_char(t);
00237             if (white_space(t, ch) || ch < 0) break;
00238         }
00239         t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0);
00240     }
00241 }
00242 
/* Return the value of the decimal digit ch, or -1 if ch is not '0'..'9'. */
static int decimal_to_num(int ch) {
    if (ch < '0' || ch > '9') return -1;
    return ch - '0';
}
00247 
/* Return the value of the hex digit ch, or -1 if it is not one.
 *
 * Only '0'..'9' and lower-case 'a'..'f' are recognised; the caller is
 * expected to have lower-cased ch already.
 */
static int hex_to_num(int ch) {
    if ('a' <= ch && ch <= 'f') return 10 + (ch - 'a');
    if ('0' <= ch && ch <= '9') return ch - '0';
    return -1;
}
00253 
00254 static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
00255     int c = 0; int d = 0;
00256     repeat {
00257         while (c < SIZE(p) && p[c] == ' ') c++;
00258         if (c == SIZE(p)) break;
00259         {
00260             int number = 0;
00261             repeat {
00262                 int ch = p[c];
00263                 if (c == SIZE(p) || ch == ' ') break;
00264                 if (base == 10) {
00265                     ch = decimal_to_num(ch);
00266                     if (ch < 0) {
00267                         error1(t, "decimal string contains non-digits");
00268                         return;
00269                     }
00270                 } else {
00271                     ch = hex_to_num(tolower(ch));
00272                     if (ch < 0) {
00273                         error1(t, "hex string contains non-hex characters");
00274                         return;
00275                     }
00276                 }
00277                 number = base * number + ch;
00278                 c++;
00279             }
00280             if (t->widechars || t->utf8) {
00281                 unless (0 <= number && number <= 0xffff) {
00282                     error1(t, "character values exceed 64K");
00283                     return;
00284                 }
00285             } else {
00286                 unless (0 <= number && number <= 0xff) {
00287                     error1(t, "character values exceed 256");
00288                     return;
00289                 }
00290             }
00291             if (t->utf8)
00292                 d += put_utf8(number, p + d);
00293             else
00294                 p[d++] = number;
00295         }
00296     }
00297     SIZE(p) = d;
00298 }
00299 
00300 extern int read_token(struct tokeniser * t) {
00301     symbol * p = t->p;
00302     int held = t->token_held;
00303     t->token_held = false;
00304     if (held) return t->token;
00305     repeat {
00306         int code = next_token(t);
00307         switch (code) {
00308             case c_comment1: /*  slash-slash comment */
00309                while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
00310                continue;
00311             case c_comment2: /* slash-star comment */
00312                repeat {
00313                    if (t->c >= SIZE(p)) {
00314                        error1(t, "/* comment not terminated");
00315                        t->token = -1;
00316                        return -1;
00317                    }
00318                    if (p[t->c] == '\n') t->line_number++;
00319                    if (eq_s(t, "*/")) break;
00320                    t->c++;
00321                }
00322                continue;
00323             case c_stringescapes:
00324                {
00325                    int ch1 = next_real_char(t);
00326                    int ch2 = next_real_char(t);
00327                    if (ch2 < 0)
00328                        { error2(t, "stringescapes"); continue; }
00329                    if (ch1 == '\'')
00330                        { error1(t, "first stringescape cannot be '"); continue; }
00331                    t->m_start = ch1;
00332                    t->m_end = ch2;
00333                }
00334                continue;
00335             case c_stringdef:
00336                {
00337                    int base = 0;
00338                    read_chars(t);
00339                    code = read_token(t);
00340                    if (code == c_hex) { base = 16; code = read_token(t); } else
00341                    if (code == c_decimal) { base = 10; code = read_token(t); }
00342                    unless (code == c_literalstring)
00343                        { error1(t, "string omitted after stringdef"); continue; }
00344                    if (base > 0) convert_numeric_string(t, t->b, base);
00345                    {   NEW(m_pair, q);
00346                        q->next = t->m_pairs;
00347                        q->name = copy_b(t->b2);
00348                        q->value = copy_b(t->b);
00349                        t->m_pairs = q;
00350                    }
00351                }
00352                continue;
00353             case c_get:
00354                code = read_token(t);
00355                unless (code == c_literalstring) {
00356                    error1(t, "string omitted after get"); continue;
00357                }
00358                t->get_depth++;
00359                if (t->get_depth > 10) {
00360                    fprintf(stderr, "get directives go 10 deep. Looping?\n");
00361                    exit(1);
00362                }
00363                {
00364                    NEW(input, q);
00365                    symbol * u = get_input(t->b);
00366                    if (u == 0) {
00367                        struct include * r = t->includes;
00368                        until (r == 0) {
00369                            symbol * b = copy_b(r->b);
00370                            b = add_to_b(b, SIZE(t->b), t->b);
00371                            u = get_input(b);
00372                            lose_b(b);
00373                            unless (u == 0) break;
00374                            r = r->next;
00375                        }
00376                    }
00377                    if (u == 0) {
00378                        error(t, "Can't get '", SIZE(t->b), t->b, "'");
00379                        exit(1);
00380                    }
00381                    memmove(q, t, sizeof(struct input));
00382                    t->next = q;
00383                    t->p = u;
00384                    t->c = 0;
00385                    t->line_number = 1;
00386                }
00387                p = t->p;
00388                continue;
00389             case -1:
00390                unless (t->next == 0) {
00391                    lose_b(p);
00392                    {
00393                        struct input * q = t->next;
00394                        memmove(t, q, sizeof(struct input)); p = t->p;
00395                        FREE(q);
00396                    }
00397                    t->get_depth--;
00398                    continue;
00399                }
00400                /* drop through */
00401             default:
00402                 t->previous_token = t->token;
00403                 t->token = code;
00404                 return code;
00405         }
00406     }
00407 }
00408 
00409 extern byte * name_of_token(int code) {
00410     int i;
00411     for (i = 1; i < vocab->code; i++)
00412         if ((vocab + i)->code == code) return (vocab + i)->s;
00413     switch (code) {
00414         case c_mathassign:   return (byte *) "=";
00415         case c_name:         return (byte *) "name";
00416         case c_number:       return (byte *) "number";
00417         case c_literalstring:return (byte *) "literal";
00418         case c_neg:          return (byte *) "neg";
00419         case c_grouping:     return (byte *) "grouping";
00420         case c_call:         return (byte *) "call";
00421         case c_booltest:     return (byte *) "Boolean test";
00422         case -2:             return (byte *) "start of text";
00423         case -1:             return (byte *) "end of text";
00424         default:             return (byte *) "?";
00425     }
00426 }
00427 
00428 extern struct tokeniser * create_tokeniser(symbol * p) {
00429     NEW(tokeniser, t);
00430     t->next = 0;
00431     t->p = p;
00432     t->c = 0;
00433     t->line_number = 1;
00434     t->b = create_b(0);
00435     t->b2 = create_b(0);
00436     t->m_start = -1;
00437     t->m_pairs = 0;
00438     t->get_depth = 0;
00439     t->error_count = 0;
00440     t->token_held = false;
00441     t->token = -2;
00442     t->previous_token = -2;
00443     return t;
00444 }
00445 
00446 extern void close_tokeniser(struct tokeniser * t) {
00447     lose_b(t->b);
00448     lose_b(t->b2);
00449     {
00450         struct m_pair * q = t->m_pairs;
00451         until (q == 0) {
00452             struct m_pair * q_next = q->next;
00453             lose_b(q->name);
00454             lose_b(q->value);
00455             FREE(q);
00456             q = q_next;
00457         }
00458     }
00459     {
00460         struct input * q = t->next;
00461         until (q == 0) {
00462             struct input * q_next = q->next;
00463             FREE(q);
00464             q = q_next;
00465         }
00466     }
00467     FREE(t);
00468 }
00469