1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package scanner
16
17 import (
18 "bytes"
19 "fmt"
20 "io"
21 "os"
22 "unicode"
23 "unicode/utf8"
24 )
25
26
27
// Position describes a location in a source. A Position is considered valid
// when its Line is greater than zero.
type Position struct {
	Filename string // name of the source, if any
	Offset   int    // byte offset
	Line     int    // line number; values <= 0 mark the position as invalid
	Column   int    // column number
}

// IsValid reports whether the position carries a line number, i.e. whether
// Line is greater than zero.
func (pos *Position) IsValid() bool {
	return pos.Line > 0
}

// String renders the position as "name:line:column". An empty Filename is
// shown as "<input>", and the ":line:column" suffix is omitted when the
// position is not valid.
func (pos Position) String() string {
	name := pos.Filename
	if name == "" {
		name = "<input>"
	}
	if !pos.IsValid() {
		return name
	}
	return name + fmt.Sprintf(":%d:%d", pos.Line, pos.Column)
}
48
49
50
51
52
53
54
55
56
57
58
59
60
// Token kinds returned by Scan. They are negative so that they cannot collide
// with ordinary characters, which Scan returns as themselves.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment // has no token of its own; only provides the bit for SkipComments
)

// Mode bits for Scanner.Mode. Each bit is derived from the corresponding
// token kind: the bit for token T is 1 << -T.
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // Scan also recognizes integers when this bit is set
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // with ScanComments set, comments are scanned but not returned

	// GoTokens is the Mode installed by Init. ScanInts is not part of it;
	// numbers are still scanned because ScanFloats is set.
	GoTokens = ScanIdents |
		ScanFloats |
		ScanChars |
		ScanStrings |
		ScanRawStrings |
		ScanComments |
		SkipComments
)
85
86 var tokenString = map[rune]string{
87 EOF: "EOF",
88 Ident: "Ident",
89 Int: "Int",
90 Float: "Float",
91 Char: "Char",
92 String: "String",
93 RawString: "RawString",
94 Comment: "Comment",
95 }
96
97
98 func TokenString(tok rune) string {
99 if s, found := tokenString[tok]; found {
100 return s
101 }
102 return fmt.Sprintf("%q", string(tok))
103 }
104
105
106
const (
	// GoWhitespace is the Whitespace set installed by Init: one bit per
	// character for tab, newline, carriage return and space.
	GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

	// bufLen is the number of source bytes the scanner buffers at a time.
	bufLen = 1024
)
110
111
112 type Scanner struct {
113
114 src io.Reader
115
116
117 srcBuf [bufLen + 1]byte
118 srcPos int
119 srcEnd int
120
121
122 srcBufOffset int
123 line int
124 column int
125 lastLineLen int
126 lastCharLen int
127
128
129
130
131
132 tokBuf bytes.Buffer
133 tokPos int
134 tokEnd int
135
136
137 ch rune
138
139
140
141 Error func(s *Scanner, msg string)
142
143
144 ErrorCount int
145
146
147
148
149 Mode uint
150
151
152
153
154
155 Whitespace uint64
156
157
158
159
160
161
162 IsIdentRune func(ch rune, i int) bool
163
164
165
166
167
168
169
170
171 Position
172 }
173
174
175
176
177 func (s *Scanner) Init(src io.Reader) *Scanner {
178 s.src = src
179
180
181
182 s.srcBuf[0] = utf8.RuneSelf
183 s.srcPos = 0
184 s.srcEnd = 0
185
186
187 s.srcBufOffset = 0
188 s.line = 1
189 s.column = 0
190 s.lastLineLen = 0
191 s.lastCharLen = 0
192
193
194
195 s.tokPos = -1
196
197
198 s.ch = -2
199
200
201 s.Error = nil
202 s.ErrorCount = 0
203 s.Mode = GoTokens
204 s.Whitespace = GoWhitespace
205 s.Line = 0
206
207 return s
208 }
209
210
211
212
213
214 func (s *Scanner) next() rune {
215 ch, width := rune(s.srcBuf[s.srcPos]), 1
216
217 if ch >= utf8.RuneSelf {
218
219 for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
220
221
222 if s.tokPos >= 0 {
223 s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
224 s.tokPos = 0
225
226 }
227
228 copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
229 s.srcBufOffset += s.srcPos
230
231
232
233
234
235 i := s.srcEnd - s.srcPos
236 n, err := s.src.Read(s.srcBuf[i:bufLen])
237 s.srcPos = 0
238 s.srcEnd = i + n
239 s.srcBuf[s.srcEnd] = utf8.RuneSelf
240 if err != nil {
241 if err != io.EOF {
242 s.error(err.Error())
243 }
244 if s.srcEnd == 0 {
245 if s.lastCharLen > 0 {
246
247 s.column++
248 }
249 s.lastCharLen = 0
250 return EOF
251 }
252
253
254
255
256 break
257 }
258 }
259
260 ch = rune(s.srcBuf[s.srcPos])
261 if ch >= utf8.RuneSelf {
262
263 ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
264 if ch == utf8.RuneError && width == 1 {
265
266 s.srcPos += width
267 s.lastCharLen = width
268 s.column++
269 s.error("illegal UTF-8 encoding")
270 return ch
271 }
272 }
273 }
274
275
276 s.srcPos += width
277 s.lastCharLen = width
278 s.column++
279
280
281 switch ch {
282 case 0:
283
284 s.error("illegal character NUL")
285 case '\n':
286 s.line++
287 s.lastLineLen = s.column
288 s.column = 0
289 }
290
291 return ch
292 }
293
294
295
296
297
298
299
300 func (s *Scanner) Next() rune {
301 s.tokPos = -1
302 s.Line = 0
303 ch := s.Peek()
304 if ch != EOF {
305 s.ch = s.next()
306 }
307 return ch
308 }
309
310
311
312
313 func (s *Scanner) Peek() rune {
314 if s.ch == -2 {
315
316 s.ch = s.next()
317 if s.ch == '\uFEFF' {
318 s.ch = s.next()
319 }
320 }
321 return s.ch
322 }
323
324 func (s *Scanner) error(msg string) {
325 s.ErrorCount++
326 if s.Error != nil {
327 s.Error(s, msg)
328 return
329 }
330 pos := s.Position
331 if !pos.IsValid() {
332 pos = s.Pos()
333 }
334 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
335 }
336
337 func (s *Scanner) isIdentRune(ch rune, i int) bool {
338 if s.IsIdentRune != nil {
339 return s.IsIdentRune(ch, i)
340 }
341 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0
342 }
343
344 func (s *Scanner) scanIdentifier() rune {
345
346 ch := s.next()
347 for i := 1; s.isIdentRune(ch, i); i++ {
348 ch = s.next()
349 }
350 return ch
351 }
352
// digitVal returns the numeric value of ch interpreted as a hexadecimal digit
// (0-9, a-f, A-F), or 16 if ch is not such a digit. Since 16 is not below any
// base up to 16, callers can test digitVal(ch) < base.
func digitVal(ch rune) int {
	if ch >= '0' && ch <= '9' {
		return int(ch - '0')
	}
	if ch >= 'a' && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if ch >= 'A' && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16
}
364
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch rune) bool {
	return !(ch < '0' || ch > '9')
}
366
367 func (s *Scanner) scanMantissa(ch rune) rune {
368 for isDecimal(ch) {
369 ch = s.next()
370 }
371 return ch
372 }
373
374 func (s *Scanner) scanFraction(ch rune) rune {
375 if ch == '.' {
376 ch = s.scanMantissa(s.next())
377 }
378 return ch
379 }
380
381 func (s *Scanner) scanExponent(ch rune) rune {
382 if ch == 'e' || ch == 'E' {
383 ch = s.next()
384 if ch == '-' || ch == '+' {
385 ch = s.next()
386 }
387 ch = s.scanMantissa(ch)
388 }
389 return ch
390 }
391
392 func (s *Scanner) scanNumber(ch rune) (rune, rune) {
393
394 if ch == '0' {
395
396 ch = s.next()
397 if ch == 'x' || ch == 'X' {
398
399 ch = s.next()
400 hasMantissa := false
401 for digitVal(ch) < 16 {
402 ch = s.next()
403 hasMantissa = true
404 }
405 if !hasMantissa {
406 s.error("illegal hexadecimal number")
407 }
408 } else {
409
410 has8or9 := false
411 for isDecimal(ch) {
412 if ch > '7' {
413 has8or9 = true
414 }
415 ch = s.next()
416 }
417 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
418
419 ch = s.scanFraction(ch)
420 ch = s.scanExponent(ch)
421 return Float, ch
422 }
423
424 if has8or9 {
425 s.error("illegal octal number")
426 }
427 }
428 return Int, ch
429 }
430
431 ch = s.scanMantissa(ch)
432 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
433
434 ch = s.scanFraction(ch)
435 ch = s.scanExponent(ch)
436 return Float, ch
437 }
438 return Int, ch
439 }
440
441 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
442 for n > 0 && digitVal(ch) < base {
443 ch = s.next()
444 n--
445 }
446 if n > 0 {
447 s.error("illegal char escape")
448 }
449 return ch
450 }
451
452 func (s *Scanner) scanEscape(quote rune) rune {
453 ch := s.next()
454 switch ch {
455 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
456
457 ch = s.next()
458 case '0', '1', '2', '3', '4', '5', '6', '7':
459 ch = s.scanDigits(ch, 8, 3)
460 case 'x':
461 ch = s.scanDigits(s.next(), 16, 2)
462 case 'u':
463 ch = s.scanDigits(s.next(), 16, 4)
464 case 'U':
465 ch = s.scanDigits(s.next(), 16, 8)
466 default:
467 s.error("illegal char escape")
468 }
469 return ch
470 }
471
472 func (s *Scanner) scanString(quote rune) (n int) {
473 ch := s.next()
474 for ch != quote {
475 if ch == '\n' || ch < 0 {
476 s.error("literal not terminated")
477 return
478 }
479 if ch == '\\' {
480 ch = s.scanEscape(quote)
481 } else {
482 ch = s.next()
483 }
484 n++
485 }
486 return
487 }
488
489 func (s *Scanner) scanRawString() {
490 ch := s.next()
491 for ch != '`' {
492 if ch < 0 {
493 s.error("literal not terminated")
494 return
495 }
496 ch = s.next()
497 }
498 }
499
500 func (s *Scanner) scanChar() {
501 if s.scanString('\'') != 1 {
502 s.error("illegal char literal")
503 }
504 }
505
506 func (s *Scanner) scanComment(ch rune) rune {
507
508 if ch == '/' {
509
510 ch = s.next()
511 for ch != '\n' && ch >= 0 {
512 ch = s.next()
513 }
514 return ch
515 }
516
517
518 ch = s.next()
519 for {
520 if ch < 0 {
521 s.error("comment not terminated")
522 break
523 }
524 ch0 := ch
525 ch = s.next()
526 if ch0 == '*' && ch == '/' {
527 ch = s.next()
528 break
529 }
530 }
531 return ch
532 }
533
534
535
536
537
538
539 func (s *Scanner) Scan() rune {
540 ch := s.Peek()
541
542
543 s.tokPos = -1
544 s.Line = 0
545
546 redo:
547
548 for s.Whitespace&(1<<uint(ch)) != 0 {
549 ch = s.next()
550 }
551
552
553 s.tokBuf.Reset()
554 s.tokPos = s.srcPos - s.lastCharLen
555
556
557
558 s.Offset = s.srcBufOffset + s.tokPos
559 if s.column > 0 {
560
561 s.Line = s.line
562 s.Column = s.column
563 } else {
564
565
566
567 s.Line = s.line - 1
568 s.Column = s.lastLineLen
569 }
570
571
572 tok := ch
573 switch {
574 case s.isIdentRune(ch, 0):
575 if s.Mode&ScanIdents != 0 {
576 tok = Ident
577 ch = s.scanIdentifier()
578 } else {
579 ch = s.next()
580 }
581 case isDecimal(ch):
582 if s.Mode&(ScanInts|ScanFloats) != 0 {
583 tok, ch = s.scanNumber(ch)
584 } else {
585 ch = s.next()
586 }
587 default:
588 switch ch {
589 case EOF:
590 break
591 case '"':
592 if s.Mode&ScanStrings != 0 {
593 s.scanString('"')
594 tok = String
595 }
596 ch = s.next()
597 case '\'':
598 if s.Mode&ScanChars != 0 {
599 s.scanChar()
600 tok = Char
601 }
602 ch = s.next()
603 case '.':
604 ch = s.next()
605 if isDecimal(ch) && s.Mode&ScanFloats != 0 {
606 tok = Float
607 ch = s.scanMantissa(ch)
608 ch = s.scanExponent(ch)
609 }
610 case '/':
611 ch = s.next()
612 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
613 if s.Mode&SkipComments != 0 {
614 s.tokPos = -1
615 ch = s.scanComment(ch)
616 goto redo
617 }
618 ch = s.scanComment(ch)
619 tok = Comment
620 }
621 case '`':
622 if s.Mode&ScanRawStrings != 0 {
623 s.scanRawString()
624 tok = String
625 }
626 ch = s.next()
627 default:
628 ch = s.next()
629 }
630 }
631
632
633 s.tokEnd = s.srcPos - s.lastCharLen
634
635 s.ch = ch
636 return tok
637 }
638
639
640
641
642
643 func (s *Scanner) Pos() (pos Position) {
644 pos.Filename = s.Filename
645 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
646 switch {
647 case s.column > 0:
648
649 pos.Line = s.line
650 pos.Column = s.column
651 case s.lastLineLen > 0:
652
653 pos.Line = s.line - 1
654 pos.Column = s.lastLineLen
655 default:
656
657 pos.Line = 1
658 pos.Column = 1
659 }
660 return
661 }
662
663
664
665 func (s *Scanner) TokenText() string {
666 if s.tokPos < 0 {
667
668 return ""
669 }
670
671 if s.tokEnd < 0 {
672
673 s.tokEnd = s.tokPos
674 }
675
676 if s.tokBuf.Len() == 0 {
677
678 return string(s.srcBuf[s.tokPos:s.tokEnd])
679 }
680
681
682
683 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
684 s.tokPos = s.tokEnd
685 return s.tokBuf.String()
686 }
687
View as plain text