TrinityCore
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tokenizer.h
Go to the documentation of this file.
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // Author: [email protected] (Kenton Varda)
32 // Based on original Protocol Buffers design by
33 // Sanjay Ghemawat, Jeff Dean, and others.
34 //
35 // Class for parsing tokenized text from a ZeroCopyInputStream.
36 
37 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
38 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
39 
40 #include <string>
41 #include <vector>
43 
44 namespace google {
45 namespace protobuf {
46 namespace io {
47 
48 class ZeroCopyInputStream; // zero_copy_stream.h
49 
50 // Defined in this file.
51 class ErrorCollector;
52 class Tokenizer;
53 
54 // Abstract interface for an object which collects the errors that occur
55 // during parsing. A typical implementation might simply print the errors
56 // to stdout.
58  public:
59  inline ErrorCollector() {}
60  virtual ~ErrorCollector();
61 
62  // Indicates that there was an error in the input at the given line and
63  // column numbers. The numbers are zero-based, so you may want to add
64  // 1 to each before printing them.
65  virtual void AddError(int line, int column, const string& message) = 0;
66 
67  // Indicates that there was a warning in the input at the given line and
68  // column numbers. The numbers are zero-based, so you may want to add
69  // 1 to each before printing them.
70  virtual void AddWarning(int /* line */, int /* column */,
71  const string& /* message */) { }
72 
73  private:
75 };
76 
77 // This class converts a stream of raw text into a stream of tokens for
78 // the protocol definition parser to parse. The tokens recognized are
79 // similar to those that make up the C language; see the TokenType enum for
80 // precise descriptions. Whitespace and comments are skipped. By default,
81 // C- and C++-style comments are recognized, but other styles can be used by
82 // calling set_comment_style().
84  public:
85  // Construct a Tokenizer that reads and tokenizes text from the given
86  // input stream and writes errors to the given error_collector.
87  // The caller keeps ownership of input and error_collector.
89  ~Tokenizer();
90 
91  enum TokenType {
92  TYPE_START, // Next() has not yet been called.
93  TYPE_END, // End of input reached. "text" is empty.
94 
95  TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not
96  // starting with a digit. It is an error for a number
97  // to be followed by an identifier with no space in
98  // between.
99  TYPE_INTEGER, // A sequence of digits representing an integer. Normally
100  // the digits are decimal, but a prefix of "0x" indicates
101  // a hex number and a leading zero indicates octal, just
102  // like with C numeric literals. A leading negative sign
103  // is NOT included in the token; it's up to the parser to
104  // interpret the unary minus operator on its own.
105  TYPE_FLOAT, // A floating point literal, with a fractional part and/or
106  // an exponent. Always in decimal. Again, never
107  // negative.
108  TYPE_STRING, // A quoted sequence of escaped characters. Either single
109  // or double quotes can be used, but they must match.
110  // A string literal cannot cross a line break.
111  TYPE_SYMBOL, // Any other printable character, like '!' or '+'.
112  // Symbols are always a single character, so "!+$%" is
113  // four tokens.
114  };
115 
116  // Structure representing a token read from the token stream.
117  struct Token {
119  string text; // The exact text of the token as it appeared in
120  // the input. e.g. tokens of TYPE_STRING will still
121  // be escaped and in quotes.
122 
123  // "line" and "column" specify the position of the first character of
124  // the token within the input stream. They are zero-based.
125  int line;
126  int column;
128  };
129 
130  // Get the current token. This is updated when Next() is called. Before
131  // the first call to Next(), current() has type TYPE_START and no contents.
132  const Token& current();
133 
134  // Return the previous token -- i.e. what current() returned before the
135  // previous call to Next().
136  const Token& previous();
137 
138  // Advance to the next token. Returns false if the end of the input is
139  // reached.
140  bool Next();
141 
142  // Like Next(), but also collects comments which appear between the previous
143  // and next tokens.
144  //
145  // Comments which appear to be attached to the previous token are stored
146  // in *prev_trailing_comments. Comments which appear to be attached to the
147  // next token are stored in *next_leading_comments. Comments appearing in
148  // between which do not appear to be attached to either will be added to
149  // *detached_comments. Any of these parameters can be NULL to simply discard
150  // the comments.
151  //
152  // A series of line comments appearing on consecutive lines, with no other
153  // tokens appearing on those lines, will be treated as a single comment.
154  //
155  // Only the comment content is returned; comment markers (e.g. //) are
156  // stripped out. For block comments, leading whitespace and an asterisk will
157  // be stripped from the beginning of each line other than the first. Newlines
158  // are included in the output.
159  //
160  // Examples:
161  //
162  // optional int32 foo = 1; // Comment attached to foo.
163  // // Comment attached to bar.
164  // optional int32 bar = 2;
165  //
166  // optional string baz = 3;
167  // // Comment attached to baz.
168  // // Another line attached to baz.
169  //
170  // // Comment attached to qux.
171  // //
172  // // Another line attached to qux.
173  // optional double qux = 4;
174  //
175  // // Detached comment. This is not attached to qux or corge
176  // // because there are blank lines separating it from both.
177  //
178  // optional string corge = 5;
179  // /* Block comment attached
180  // * to corge. Leading asterisks
181  // * will be removed. */
182  // /* Block comment attached to
183  // * grault. */
184  // optional int32 grault = 6;
185  bool NextWithComments(string* prev_trailing_comments,
186  vector<string>* detached_comments,
187  string* next_leading_comments);
188 
189  // Parse helpers ---------------------------------------------------
190 
191  // Parses a TYPE_FLOAT token. This never fails, so long as the text actually
192  // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the
193  // result is undefined (possibly an assert failure).
194  static double ParseFloat(const string& text);
195 
196  // Parses a TYPE_STRING token. This never fails, so long as the text actually
197  // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the
198  // result is undefined (possibly an assert failure).
199  static void ParseString(const string& text, string* output);
200 
201  // Identical to ParseString, but appends to output.
202  static void ParseStringAppend(const string& text, string* output);
203 
204  // Parses a TYPE_INTEGER token. Returns false if the result would be
205  // greater than max_value. Otherwise, returns true and sets *output to the
206  // result. If the text is not from a Token of type TYPE_INTEGER originally
207  // parsed by a Tokenizer, the result is undefined (possibly an assert
208  // failure).
209  static bool ParseInteger(const string& text, uint64 max_value,
210  uint64* output);
211 
212  // Options ---------------------------------------------------------
213 
214  // Set true to allow floats to be suffixed with the letter 'f'. Tokens
215  // which would otherwise be integers but which have the 'f' suffix will be
216  // forced to be interpreted as floats. For all other purposes, the 'f' is
217  // ignored. The chosen value is stored in allow_f_after_float_.
218  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
219 
220  // Valid values for set_comment_style().
222  // Line comments begin with "//", block comments are delimited by "/*" and
223  // "*/".
225  // Line comments begin with "#". No way to write block comments.
226  SH_COMMENT_STYLE
227  };
228 
229  // Sets the comment style (which comment syntax is recognized); stored in comment_style_.
230  void set_comment_style(CommentStyle style) { comment_style_ = style; }
231 
232  // Whether to require whitespace between a number and a field name.
233  // Default is true. Do not use this; for Google-internal cleanup only.
234  void set_require_space_after_number(bool require) {
235  require_space_after_number_ = require;  // Only records the option.
236  }
237 
238  // Whether to allow string literals to span multiple lines. Default is false.
239  // Do not use this; for Google-internal cleanup only.
240  void set_allow_multiline_strings(bool allow) {
241  allow_multiline_strings_ = allow;  // Only records the option.
242  }
243 
244  // External helper: validate an identifier.
245  static bool IsIdentifier(const string& text);
246 
247  // -----------------------------------------------------------------
248  private:
250 
251  Token current_; // Returned by current().
252  Token previous_; // Returned by previous().
253 
256 
257  char current_char_; // == buffer_[buffer_pos_], updated by NextChar().
258  const char* buffer_; // Current buffer returned from input_.
259  int buffer_size_; // Size of buffer_.
260  int buffer_pos_; // Current position within the buffer.
261  bool read_error_; // Did we previously encounter a read error?
262 
263  // Line and column number of current_char_ within the whole input stream.
264  int line_;
265  int column_;
266 
267  // String to which text should be appended as we advance through it.
268  // Call RecordTo(&str) to start recording and StopRecording() to stop.
269  // E.g. StartToken() calls RecordTo(&current_.text). record_start_ is the
270  // position within the current buffer where recording started.
271  string* record_target_;
273 
274  // Options.
279 
280  // Since we count columns we need to interpret tabs somehow. We'll take
281  // the standard 8-character definition for lack of any way to do better.
282  static const int kTabWidth = 8;
283 
284  // -----------------------------------------------------------------
285  // Helper methods.
286 
287  // Consume this character and advance to the next one.
288  void NextChar();
289 
290  // Read a new buffer from the input.
291  void Refresh();
292 
293  inline void RecordTo(string* target);
294  inline void StopRecording();
295 
296  // Called when the current character is the first character of a new
297  // token (not including whitespace or comments).
298  inline void StartToken();
299  // Called when the current character is the first character after the
300  // end of the last token. After this returns, current_.text will
301  // contain all text consumed since StartToken() was called.
302  inline void EndToken();
303 
304  // Convenience method to add an error at the current line and column: forwards message to error_collector_->AddError().
305  void AddError(const string& message) {
306  error_collector_->AddError(line_, column_, message);  // line_/column_ track current_char_'s position in the input.
307  }
308 
309  // -----------------------------------------------------------------
310  // The following four methods are used to consume tokens of specific
311  // types. They are actually used to consume all characters *after*
312  // the first, since the calling function consumes the first character
313  // in order to decide what kind of token is being read.
314 
315  // Read and consume a string, ending when the given delimiter is
316  // consumed.
317  void ConsumeString(char delimiter);
318 
319  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
320  // depending on what was read. This needs to know if the first
321  // character was a zero in order to correctly recognize hex and octal
322  // numbers.
323  // It also needs to know if the first character was a '.' to parse floating
324  // point correctly.
325  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
326 
327  // Consume the rest of a line.
328  void ConsumeLineComment(string* content);
329  // Consume until "*/".
330  void ConsumeBlockComment(string* content);
331 
333  // Started a line comment.
335 
336  // Started a block comment.
338 
339  // Consumed a slash, then realized it wasn't a comment. current_ has
340  // been filled in with a slash token. The caller should return it.
342 
343  // We do not appear to be starting a comment here.
344  NO_COMMENT
345  };
346 
347  // If we're at the start of a new comment, consume it and return what kind
348  // of comment it is.
349  NextCommentStatus TryConsumeCommentStart();
350 
351  // -----------------------------------------------------------------
352  // These helper methods make the parsing code more readable. The
353  // "character classes" referred to are defined at the top of the .cc file.
354  // Basically it is a C++ class with one method:
355  // static bool InClass(char c);
356  // The method returns true if c is a member of this "class", like "Letter"
357  // or "Digit".
358 
359  // Returns true if the current character is of the given character
360  // class, but does not consume anything.
361  template<typename CharacterClass>
362  inline bool LookingAt();
363 
364  // If the current character is in the given class, consume it and return
365  // true. Otherwise return false.
366  // e.g. TryConsumeOne<Letter>()
367  template<typename CharacterClass>
368  inline bool TryConsumeOne();
369 
370  // Like above, but try to consume the specific character indicated.
371  inline bool TryConsume(char c);
372 
373  // Consume zero or more of the given character class.
374  template<typename CharacterClass>
375  inline void ConsumeZeroOrMore();
376 
377  // Consume one or more of the given character class or log the given
378  // error message.
379  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
380  template<typename CharacterClass>
381  inline void ConsumeOneOrMore(const char* error);
382 };
383 
384 // inline methods ====================================================
386  return current_;
387 }
388 
390  return previous_;
391 }
392 
393 inline void Tokenizer::ParseString(const string& text, string* output) {  // Parses text into *output, replacing its contents.
394  output->clear();  // Drop any existing contents first, since the helper below appends.
395  ParseStringAppend(text, output);  // Appends the parsed string to the now-empty *output.
396 }
397 
398 } // namespace io
399 } // namespace protobuf
400 
401 } // namespace google
402 #endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
Definition: tokenizer.h:57
void AddError(const string &message)
Definition: tokenizer.h:305
int line_
Definition: tokenizer.h:264
CommentStyle comment_style_
Definition: tokenizer.h:276
Definition: zero_copy_stream.h:124
Definition: Util.h:45
int column
Definition: tokenizer.h:126
int line
Definition: tokenizer.h:125
int end_column
Definition: tokenizer.h:127
virtual void AddError(int line, int column, const string &message)=0
int buffer_pos_
Definition: tokenizer.h:260
Token previous_
Definition: tokenizer.h:252
Token current_
Definition: tokenizer.h:251
Definition: tokenizer.h:83
#define GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(TypeName)
Definition: common.h:89
void set_require_space_after_number(bool require)
Definition: tokenizer.h:234
void set_allow_multiline_strings(bool allow)
Definition: tokenizer.h:240
int record_start_
Definition: tokenizer.h:272
string text
Definition: tokenizer.h:119
TokenType type
Definition: tokenizer.h:118
#define output
Definition: wire_format_lite.h:381
bool allow_multiline_strings_
Definition: tokenizer.h:278
const char * buffer_
Definition: tokenizer.h:258
static void ParseString(const string &text, string *output)
Definition: tokenizer.h:393
void set_allow_f_after_float(bool value)
Definition: tokenizer.h:218
TokenType
Definition: tokenizer.h:91
bool require_space_after_number_
Definition: tokenizer.h:277
void set_comment_style(CommentStyle style)
Definition: tokenizer.h:230
static void ParseStringAppend(const string &text, string *output)
ErrorCollector * error_collector_
Definition: tokenizer.h:255
string * record_target_
Definition: tokenizer.h:271
#define input
Definition: wire_format_lite.h:242
virtual void AddWarning(int, int, const string &)
Definition: tokenizer.h:70
NextCommentStatus
Definition: tokenizer.h:332
uint32_t previous(octet_iterator &it, octet_iterator pass_start)
Deprecated in versions that include "prior".
Definition: checked.h:179
char current_char_
Definition: tokenizer.h:257
uint64_t uint64
Definition: common.h:178
const Token & current()
Definition: tokenizer.h:385
bool read_error_
Definition: tokenizer.h:261
#define LIBPROTOBUF_EXPORT
Definition: common.h:105
const Token & previous()
Definition: tokenizer.h:389
ErrorCollector()
Definition: tokenizer.h:59
bool allow_f_after_float_
Definition: tokenizer.h:275
Definition: BnetFileGenerator.h:47
const FieldDescriptor value
Definition: descriptor.h:1522
ZeroCopyInputStream * input_
Definition: tokenizer.h:254
Definition: tokenizer.h:117
int buffer_size_
Definition: tokenizer.h:259
int column_
Definition: tokenizer.h:265
CommentStyle
Definition: tokenizer.h:221