00001
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <string.h>
00005 #include <ctype.h>
00006 #include "header.h"
00007
00008 struct system_word {
00009 int s_size;
00010 byte * s;
00011 int code;
00012 };
00013
00014
00015
00016
00017 #include "syswords.h"
00018
/* Return the lesser of a and b (b when the two are equal). */
static int smaller(int a, int b) {
    if (a < b) return a;
    return b;
}
00020
00021 extern symbol * get_input(symbol * p) {
00022
00023 char * s = b_to_s(p);
00024 {
00025 FILE * input = fopen(s, "r");
00026 free(s);
00027 if (input == 0) return 0;
00028 {
00029 symbol * u = create_b(STARTSIZE);
00030 int size = 0;
00031 repeat
00032 { int ch = getc(input);
00033 if (ch == EOF) break;
00034 if (size >= CAPACITY(u)) u = increase_capacity(u, size/2);
00035 u[size++] = ch;
00036 }
00037 fclose(input);
00038 SIZE(u) = size; return u;
00039 }
00040 }
00041 }
00042
00043 static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) {
00044 if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); }
00045 fprintf(stderr, "Line %d", t->line_number);
00046 if (t->get_depth > 0) fprintf(stderr, " (of included file)");
00047 fprintf(stderr, ": ");
00048 unless (s1 == 0) fprintf(stderr, "%s", s1);
00049 unless (p == 0) {
00050 int i;
00051 for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]);
00052 }
00053 unless (s2 == 0) fprintf(stderr, "%s", s2);
00054 fprintf(stderr, "\n");
00055 t->error_count++;
00056 }
00057
00058 static void error1(struct tokeniser * t, char * s) {
00059 error(t, s, 0,0, 0);
00060 }
00061
00062 static void error2(struct tokeniser * t, char * s) {
00063 error(t, "unexpected end of text after ", 0,0, s);
00064 }
00065
00066 static int compare_words(int m, symbol * p, int n, byte * q) {
00067 unless (m == n) return m - n;
00068 {
00069 int i; for (i = 0; i < n; i++) {
00070 int diff = p[i] - q[i];
00071 unless (diff == 0) return diff;
00072 }
00073 }
00074 return 0;
00075 }
00076
00077 static int find_word(int n, symbol * p) {
00078 int i = 0; int j = vocab->code;
00079 repeat {
00080 int k = i + (j - i)/2;
00081 struct system_word * w = vocab + k;
00082 int diff = compare_words(n, p, w->s_size, w->s);
00083 if (diff == 0) return w->code;
00084 if (diff < 0) j = k; else i = k;
00085 if (j - i == 1) break;
00086 }
00087 return -1;
00088 }
00089
00090 static int get_number(int n, symbol * p) {
00091 int x = 0;
00092 int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0';
00093 return x;
00094 }
00095
00096 static int eq_s(struct tokeniser * t, char * s) {
00097 int l = strlen(s);
00098 if (SIZE(t->p) - t->c < l) return false;
00099 {
00100 int i;
00101 for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false;
00102 }
00103 t->c += l; return true;
00104 }
00105
00106 static int white_space(struct tokeniser * t, int ch) {
00107 switch (ch) {
00108 case '\n': t->line_number++;
00109 case '\r':
00110 case '\t':
00111 case ' ': return true;
00112 }
00113 return false;
00114 }
00115
00116 static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
00117 struct m_pair * q = t->m_pairs;
00118 repeat {
00119 if (q == 0) return 0;
00120 {
00121 symbol * name = q->name;
00122 if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
00123 }
00124 q = q->next;
00125 }
00126 }
00127
00128 static int read_literal_string(struct tokeniser * t, int c) {
00129 symbol * p = t->p;
00130 int ch;
00131 SIZE(t->b) = 0;
00132 repeat {
00133 if (c >= SIZE(p)) { error2(t, "'"); return c; }
00134 ch = p[c];
00135 if (ch == '\n') { error1(t, "string not terminated"); return c; }
00136 c++;
00137 if (ch == t->m_start) {
00138 int c0 = c;
00139 int newlines = false;
00140 int black_found = false;
00141 repeat {
00142 if (c >= SIZE(p)) { error2(t, "'"); return c; }
00143 ch = p[c]; c++;
00144 if (ch == t->m_end) break;
00145 unless (white_space(t, ch)) black_found = true;
00146 if (ch == '\n') newlines = true;
00147 if (newlines && black_found) {
00148 error1(t, "string not terminated");
00149 return c;
00150 }
00151 }
00152 unless (newlines) {
00153 int n = c - c0 - 1;
00154 int firstch = p[c0];
00155 symbol * q = find_in_m(t, n, p + c0);
00156 if (q == 0) {
00157 if (n == 1 && (firstch == '\'' || firstch == t->m_start))
00158 t->b = add_to_b(t->b, 1, p + c0);
00159 else
00160 error(t, "string macro '", n, p + c0, "' undeclared");
00161 } else
00162 t->b = add_to_b(t->b, SIZE(q), q);
00163 }
00164 } else {
00165 if (ch == '\'') return c;
00166 t->b = add_to_b(t->b, 1, p + c - 1);
00167 }
00168 }
00169 }
00170
00171 static int next_token(struct tokeniser * t) {
00172 symbol * p = t->p;
00173 int c = t->c;
00174 int ch;
00175 int code = -1;
00176 repeat {
00177 if (c >= SIZE(p)) { t->c = c; return -1; }
00178 ch = p[c];
00179 if (white_space(t, ch)) { c++; continue; }
00180 if (isalpha(ch)) {
00181 int c0 = c;
00182 while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
00183 code = find_word(c - c0, p + c0);
00184 if (code < 0) {
00185 t->b = move_to_b(t->b, c - c0, p + c0);
00186 code = c_name;
00187 }
00188 } else
00189 if (isdigit(ch)) {
00190 int c0 = c;
00191 while (c < SIZE(p) && isdigit(p[c])) c++;
00192 t->number = get_number(c - c0, p + c0);
00193 code = c_number;
00194 } else
00195 if (ch == '\'') {
00196 c = read_literal_string(t, c + 1);
00197 code = c_literalstring;
00198 } else
00199 {
00200 int lim = smaller(2, SIZE(p) - c);
00201 int i;
00202 for (i = lim; i > 0; i--) {
00203 code = find_word(i, p + c);
00204 if (code >= 0) { c += i; break; }
00205 }
00206 }
00207 if (code >= 0) {
00208 t->c = c;
00209 return code;
00210 }
00211 error(t, "'", 1, p + c, "' unknown");
00212 c++;
00213 continue;
00214 }
00215 }
00216
00217 static int next_char(struct tokeniser * t) {
00218 if (t->c >= SIZE(t->p)) return -1;
00219 return t->p[t->c++];
00220 }
00221
/* Return the next character that is not white space, or -1 at the end of
 * the text. Skipped newlines are counted by white_space(). */
static int next_real_char(struct tokeniser * t) {
    int ch;

    do {
        ch = next_char(t);
    } while (white_space(t, ch));
    return ch;
}
00229
00230 static void read_chars(struct tokeniser * t) {
00231 int ch = next_real_char(t);
00232 if (ch < 0) { error2(t, "stringdef"); return; }
00233 {
00234 int c0 = t->c-1;
00235 repeat {
00236 ch = next_char(t);
00237 if (white_space(t, ch) || ch < 0) break;
00238 }
00239 t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0);
00240 }
00241 }
00242
/* Value of the decimal digit character ch, or -1 if ch is not '0'..'9'. */
static int decimal_to_num(int ch) {
    return (ch >= '0' && ch <= '9') ? ch - '0' : -1;
}
00247
/* Value of the hexadecimal digit character ch, or -1 if it is not one.
 * Only '0'..'9' and lower-case 'a'..'f' are accepted. */
static int hex_to_num(int ch) {
    if (ch >= 'a' && ch <= 'f') return 10 + (ch - 'a');
    if (ch >= '0' && ch <= '9') return ch - '0';
    return -1;
}
00253
00254 static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
00255 int c = 0; int d = 0;
00256 repeat {
00257 while (c < SIZE(p) && p[c] == ' ') c++;
00258 if (c == SIZE(p)) break;
00259 {
00260 int number = 0;
00261 repeat {
00262 int ch = p[c];
00263 if (c == SIZE(p) || ch == ' ') break;
00264 if (base == 10) {
00265 ch = decimal_to_num(ch);
00266 if (ch < 0) {
00267 error1(t, "decimal string contains non-digits");
00268 return;
00269 }
00270 } else {
00271 ch = hex_to_num(tolower(ch));
00272 if (ch < 0) {
00273 error1(t, "hex string contains non-hex characters");
00274 return;
00275 }
00276 }
00277 number = base * number + ch;
00278 c++;
00279 }
00280 if (t->widechars || t->utf8) {
00281 unless (0 <= number && number <= 0xffff) {
00282 error1(t, "character values exceed 64K");
00283 return;
00284 }
00285 } else {
00286 unless (0 <= number && number <= 0xff) {
00287 error1(t, "character values exceed 256");
00288 return;
00289 }
00290 }
00291 if (t->utf8)
00292 d += put_utf8(number, p + d);
00293 else
00294 p[d++] = number;
00295 }
00296 }
00297 SIZE(p) = d;
00298 }
00299
00300 extern int read_token(struct tokeniser * t) {
00301 symbol * p = t->p;
00302 int held = t->token_held;
00303 t->token_held = false;
00304 if (held) return t->token;
00305 repeat {
00306 int code = next_token(t);
00307 switch (code) {
00308 case c_comment1:
00309 while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
00310 continue;
00311 case c_comment2:
00312 repeat {
00313 if (t->c >= SIZE(p)) {
00314 error1(t, "/* comment not terminated");
00315 t->token = -1;
00316 return -1;
00317 }
00318 if (p[t->c] == '\n') t->line_number++;
00319 if (eq_s(t, "*/")) break;
00320 t->c++;
00321 }
00322 continue;
00323 case c_stringescapes:
00324 {
00325 int ch1 = next_real_char(t);
00326 int ch2 = next_real_char(t);
00327 if (ch2 < 0)
00328 { error2(t, "stringescapes"); continue; }
00329 if (ch1 == '\'')
00330 { error1(t, "first stringescape cannot be '"); continue; }
00331 t->m_start = ch1;
00332 t->m_end = ch2;
00333 }
00334 continue;
00335 case c_stringdef:
00336 {
00337 int base = 0;
00338 read_chars(t);
00339 code = read_token(t);
00340 if (code == c_hex) { base = 16; code = read_token(t); } else
00341 if (code == c_decimal) { base = 10; code = read_token(t); }
00342 unless (code == c_literalstring)
00343 { error1(t, "string omitted after stringdef"); continue; }
00344 if (base > 0) convert_numeric_string(t, t->b, base);
00345 { NEW(m_pair, q);
00346 q->next = t->m_pairs;
00347 q->name = copy_b(t->b2);
00348 q->value = copy_b(t->b);
00349 t->m_pairs = q;
00350 }
00351 }
00352 continue;
00353 case c_get:
00354 code = read_token(t);
00355 unless (code == c_literalstring) {
00356 error1(t, "string omitted after get"); continue;
00357 }
00358 t->get_depth++;
00359 if (t->get_depth > 10) {
00360 fprintf(stderr, "get directives go 10 deep. Looping?\n");
00361 exit(1);
00362 }
00363 {
00364 NEW(input, q);
00365 symbol * u = get_input(t->b);
00366 if (u == 0) {
00367 struct include * r = t->includes;
00368 until (r == 0) {
00369 symbol * b = copy_b(r->b);
00370 b = add_to_b(b, SIZE(t->b), t->b);
00371 u = get_input(b);
00372 lose_b(b);
00373 unless (u == 0) break;
00374 r = r->next;
00375 }
00376 }
00377 if (u == 0) {
00378 error(t, "Can't get '", SIZE(t->b), t->b, "'");
00379 exit(1);
00380 }
00381 memmove(q, t, sizeof(struct input));
00382 t->next = q;
00383 t->p = u;
00384 t->c = 0;
00385 t->line_number = 1;
00386 }
00387 p = t->p;
00388 continue;
00389 case -1:
00390 unless (t->next == 0) {
00391 lose_b(p);
00392 {
00393 struct input * q = t->next;
00394 memmove(t, q, sizeof(struct input)); p = t->p;
00395 FREE(q);
00396 }
00397 t->get_depth--;
00398 continue;
00399 }
00400
00401 default:
00402 t->previous_token = t->token;
00403 t->token = code;
00404 return code;
00405 }
00406 }
00407 }
00408
00409 extern byte * name_of_token(int code) {
00410 int i;
00411 for (i = 1; i < vocab->code; i++)
00412 if ((vocab + i)->code == code) return (vocab + i)->s;
00413 switch (code) {
00414 case c_mathassign: return (byte *) "=";
00415 case c_name: return (byte *) "name";
00416 case c_number: return (byte *) "number";
00417 case c_literalstring:return (byte *) "literal";
00418 case c_neg: return (byte *) "neg";
00419 case c_grouping: return (byte *) "grouping";
00420 case c_call: return (byte *) "call";
00421 case c_booltest: return (byte *) "Boolean test";
00422 case -2: return (byte *) "start of text";
00423 case -1: return (byte *) "end of text";
00424 default: return (byte *) "?";
00425 }
00426 }
00427
00428 extern struct tokeniser * create_tokeniser(symbol * p) {
00429 NEW(tokeniser, t);
00430 t->next = 0;
00431 t->p = p;
00432 t->c = 0;
00433 t->line_number = 1;
00434 t->b = create_b(0);
00435 t->b2 = create_b(0);
00436 t->m_start = -1;
00437 t->m_pairs = 0;
00438 t->get_depth = 0;
00439 t->error_count = 0;
00440 t->token_held = false;
00441 t->token = -2;
00442 t->previous_token = -2;
00443 return t;
00444 }
00445
00446 extern void close_tokeniser(struct tokeniser * t) {
00447 lose_b(t->b);
00448 lose_b(t->b2);
00449 {
00450 struct m_pair * q = t->m_pairs;
00451 until (q == 0) {
00452 struct m_pair * q_next = q->next;
00453 lose_b(q->name);
00454 lose_b(q->value);
00455 FREE(q);
00456 q = q_next;
00457 }
00458 }
00459 {
00460 struct input * q = t->next;
00461 until (q == 0) {
00462 struct input * q_next = q->next;
00463 FREE(q);
00464 q = q_next;
00465 }
00466 }
00467 FREE(t);
00468 }
00469