The Battle for Wesnoth  1.13.4+dev
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tokenizer.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2007 - 2016 by David White <dave.net>
3  Part of the Silver Tree Project
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by or later.
7  This program is distributed in the hope that it will be useful,
8  but WITHOUT ANY WARRANTY.
9 
10  See the COPYING file for more details.
11 */
12 
13 #include <sstream>
14 
15 #include "formula/tokenizer.hpp"
16 
18 {
19 
20 namespace {
21 
22 void raise_exception(iterator& i1, iterator i2, std::string str) {
23  std::ostringstream expr;
24  while( (i1 != i2) && (*i1 != '\n') ) {
25  if( (*i1 != '\t') )
26  expr << *i1;
27  ++i1;
28  }
29 
30  if( str.empty() )
31  throw token_error("Unrecognized token", expr.str() );
32  else
33  throw token_error(str, expr.str() );
34 }
35 
36 }
37 
38 token get_token(iterator& i1, const iterator i2) {
39 
40  iterator it = i1;
41  if( *i1 >= 'A' ) {
42  //current character is >= 'A', limit search to the upper-half of the ASCII table
43 
44  // check if we parse now TOKEN_IDENTIFIER or TOKEN_OPERATOR/KEYWORD based on string
45  if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
46 
47  while( i1 != i2 && ( ( *i1 >= 'a' && *i1 <= 'z' ) || *i1 == '_' || ( *i1 >= 'A' && *i1 <= 'Z' ) ) )
48  ++i1;
49 
50  int diff = i1 - it;
52 
53  //check if this string matches any keyword or an operator
54  //possible operators and keywords:
55  // d, or, in, def, and, not, wfl, where, wflend, functions
56  if( diff == 1 ) {
57  if( *it == 'd' )
58  t = TOKEN_OPERATOR;
59  } else if( diff == 2 ) {
60  if( *it == 'o' && *(it+1) == 'r' )
61  t = TOKEN_OPERATOR;
62  else if( *it == 'i' && *(it+1) == 'n' )
63  t = TOKEN_OPERATOR;
64  } else if( diff == 3 ) {
65  if( *it == 'd' ) { //def
66  if( *(it+1) == 'e' && *(it+2) == 'f' )
67  t = TOKEN_KEYWORD;
68  } else if( *it == 'a' ) { //and
69  if( *(it+1) == 'n' && *(it+2) == 'd' )
70  t = TOKEN_OPERATOR;
71  } else if( *it == 'n' ) { //not
72  if( *(it+1) == 'o' && *(it+2) == 't' )
73  t = TOKEN_OPERATOR;
74  } else if( *it == 'f' ) { //fai
75  if( *(it+1) == 'a' && *(it+2) == 'i' )
76  t = TOKEN_KEYWORD;
77  } else if( *it == 'w' ) { //wfl
78  if( *(it+1) == 'f' && *(it+2) == 'l' )
79  t = TOKEN_KEYWORD;
80  }
81  } else if( diff == 5 ) {
82  std::string s(it, i1);
83  if( s == "where" )
84  t = TOKEN_OPERATOR;
85  } else if( diff == 6 ) {
86  std::string s(it, i1);
87  if( s == "faiend" )
88  t = TOKEN_KEYWORD;
89  else if( s == "wflend" )
90  t = TOKEN_KEYWORD;
91  } else if( diff == 9 ) {
92  std::string s(it, i1);
93  if( s == "functions" )
94  t = TOKEN_KEYWORD;
95  }
96 
97  return token( it, i1, t);
98  } else {
99  //at this point only 3 chars left to check:
100  if( *i1 == '[' )
101  return token( it, ++i1, TOKEN_LSQUARE );
102 
103  if( *i1 == ']' )
104  return token( it, ++i1, TOKEN_RSQUARE );
105 
106  if( *i1 == '^' )
107  return token( it, ++i1, TOKEN_OPERATOR );
108 
109  if( *i1 == '~' )
110  return token( it, ++i1, TOKEN_OPERATOR );
111 
112  //unused characters in this range:
113  // \ ` { | }
114  // Note: {} should never be used since they play poorly with WML preprocessor
115  }
116  } else {
117  //limit search to the lower-half of the ASCII table
118  //start by checking for whitespaces/end of line char
119  if( *i1 <= ' ' ) {
120  if( *i1 == '\n' ) {
121  return token( it, ++i1, TOKEN_EOL);
122  } else {
123 
124  while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
125  ++i1;
126 
127  return token( it, i1, TOKEN_WHITESPACE );
128  }
129  //try to further limit number of characters that we need to check:
130  } else if ( *i1 >= '0' ){
131  //current character is between '0' and '@'
132  if( *i1 <= '9' ) {
133  //we parse integer or decimal number
134  ++i1;
135  bool dot = false;
136 
137  while( i1 != i2 ) {
138  if( *i1 >= '0' && *i1 <= '9' ) {
139  //do nothing
140  } else {
141  //look for '.' in case of decimal number
142  if( *i1 == '.' ) {
143  //allow only one dot in such expression
144  if( !dot )
145  dot = true;
146  else
147  raise_exception(it, i2, "Multiple dots near decimal expression");
148  } else
149  break;
150  }
151  ++i1;
152  }
153 
154  if( dot )
155  return token( it, i1, TOKEN_DECIMAL );
156  else
157  return token( it, i1, TOKEN_INTEGER );
158 
159  } else {
160  //current character is between ':' and '@'
161  //possible tokens at this point that we are interested in:
162  // ; < = > <= >=
163  //unused characters in this range:
164  // : ? @
165 
166  if( *i1 == ';' ) {
167  return token( it, ++i1, TOKEN_SEMICOLON);
168  } else if( *i1 == '=' ) {
169  return token( it, ++i1, TOKEN_OPERATOR);
170  } else if( *i1 == '<' ) {
171  ++i1;
172  if( i1 != i2 ) {
173  if( *i1 == '=' )
174  return token( it, ++i1, TOKEN_OPERATOR);
175  else
176  return token( it, i1, TOKEN_OPERATOR);
177  } else
178  return token( it, i1, TOKEN_OPERATOR);
179  } else if( *i1 == '>' ) {
180  ++i1;
181  if( i1 != i2 ) {
182  if( *i1 == '=' )
183  return token( it, ++i1, TOKEN_OPERATOR);
184  else
185  return token( it, i1, TOKEN_OPERATOR);
186  } else
187  return token( it, i1, TOKEN_OPERATOR);
188  }
189  }
190  //current character is between '!' and '/'
191  //possible tokens:
192  // , . .+ .- .* ./ .. ( ) ' # + - -> * / % !=
193  //unused characters:
194  // ! " $ &
195  // ! is used only as part of !=
196  // Note: " should never be used since it plays poorly with WML
197  } else if ( *i1 == ',' ) {
198  return token( it, ++i1, TOKEN_COMMA);
199 
200  } else if ( *i1 == '.' ) {
201  ++i1;
202 
203  if( i1 != i2 ) {
204  if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/' || *i1 == '.')
205  return token( it, ++i1, TOKEN_OPERATOR );
206  else
207  return token( it, i1, TOKEN_OPERATOR );
208  } else {
209  return token( it, i1, TOKEN_OPERATOR);
210  }
211 
212  } else if ( *i1 == '(' ) {
213  return token( it, ++i1, TOKEN_LPARENS);
214 
215  } else if ( *i1 == ')' ) {
216  return token( it, ++i1, TOKEN_RPARENS);
217 
218  } else if ( *i1 == '\'' ) {
219  int bracket_depth = 0;
220  ++i1;
221  while (i1 != i2) {
222  if (*i1 == '[') {
223  bracket_depth++;
224  } else if(bracket_depth > 0 && *i1 == ']') {
225  bracket_depth--;
226  } else if(bracket_depth == 0 && *i1 == '\'') {
227  break;
228  }
229  ++i1;
230  }
231 
232  if( i1 != i2 ) {
233  return token( it, ++i1, TOKEN_STRING_LITERAL );
234  } else {
235  raise_exception(it, i2, "Missing closing ' for formula string");
236  }
237 
238  } else if ( *i1 == '#' ) {
239  ++i1;
240  while( i1 != i2 && *i1 != '#' )
241  ++i1;
242 
243  if( i1 != i2 ) {
244  return token( it, ++i1, TOKEN_COMMENT );
245  } else {
246  raise_exception(it, i2, "Missing closing # for formula comment");
247  }
248 
249  } else if ( *i1 == '+' ) {
250  return token( it, ++i1, TOKEN_OPERATOR);
251 
252  } else if ( *i1 == '-' ) {
253  ++i1;
254 
255  if( i1 != i2 ) {
256  if( *i1 == '>' )
257  return token( it, ++i1, TOKEN_POINTER );
258  else
259  return token( it, i1, TOKEN_OPERATOR );
260  } else {
261  return token( it, i1, TOKEN_OPERATOR);
262  }
263 
264  } else if ( *i1 == '*' ) {
265  return token( it, ++i1, TOKEN_OPERATOR);
266 
267  } else if ( *i1 == '/' ) {
268  return token( it, ++i1, TOKEN_OPERATOR);
269 
270  } else if ( *i1 == '%' ) {
271  return token( it, ++i1, TOKEN_OPERATOR);
272 
273  } else if ( *i1 == '!' ) {
274  ++i1;
275  if( *i1 == '=' )
276  return token( it, ++i1, TOKEN_OPERATOR);
277  else
278  raise_exception(it, i2, std::string() );
279  }
280  }
281  raise_exception(it, i2, std::string() );
282  return token();
283 }
284 
285 }
286 
287 #ifdef UNIT_TEST_TOKENIZER
288 
289 int main()
290 {
291  using namespace formula_tokenizer;
292  std::string test = "(abc + 4 * (5+3))^2";
293  std::string::const_iterator i1 = test.begin();
294  std::string::const_iterator i2 = test.end();
303  TOKEN_OPERATOR, TOKEN_INTEGER};
304  std::string tokens[] = {"(", "abc", " ", "+", " ", "4", " ",
305  "*", " ", "(", "5", "+", "3", ")", ")", "functions"};
306  for(int n = 0; n != sizeof(types)/sizeof(*types); ++n) {
307  token t = get_token(i1,i2);
308  assert(std::string(t.begin,t.end) == tokens[n]);
309  assert(t.type == types[n]);
310 
311  }
312  return 0;
313 }
314 
315 #endif
TOKEN_TYPE
TOKEN_TYPE is already defined in a Winnt.h (a windows header which is included under some conditions...
Definition: tokenizer.hpp:24
GLdouble GLdouble t
Definition: glew.h:1366
GLsizei GLenum GLenum * types
Definition: glew.h:3155
token get_token(iterator &i1, const iterator i2)
Definition: tokenizer.cpp:38
static void expr(LexState *ls, expdesc *v)
Definition: lparser.cpp:1066
int main(int argc, char **argv)
GLclampd n
Definition: glew.h:5903
std::string::const_iterator iterator
Definition: tokenizer.hpp:21
GLdouble s
Definition: glew.h:1358
GLsizei const GLcharARB ** string
Definition: glew.h:4503
static void test()