-
Notifications
You must be signed in to change notification settings - Fork 5.9k
/
api.go
410 lines (373 loc) · 10.4 KB
/
api.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
// Copyright (c) 2015 The golex Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package lex
import (
"bytes"
"fmt"
"go/token"
"io"
"os"
)
// BOM handling modes which can be set by the BOMMode Option. Default is
// BOMIgnoreFirst.
const (
	BOMError       = iota // BOM is an error anywhere.
	BOMIgnoreFirst        // Skip BOM if at beginning, report as error if anywhere else.
	BOMPassAll            // No special handling of BOM.
	BOMPassFirst          // No special handling of BOM if at beginning, report as error if anywhere else.
)
const (
	NonASCII = 0x80 // DefaultRuneClass returns NonASCII for non ASCII runes.
	RuneEOF  = -1   // Distinct from any valid Unicode rune value.
)

// DefaultRuneClass returns the character class of r. If r is an ASCII code
// then its class equals the ASCII code. Any other rune is of class NonASCII.
//
// DefaultRuneClass is the default implementation Lexer will use to convert
// runes (21 bit entities) to scanner classes (8 bit entities).
//
// Non ASCII aware lexical analyzers will typically use their own
// categorization function. To assign such custom function use the RuneClass
// option.
func DefaultRuneClass(r rune) int {
	// Anything outside the 7-bit ASCII range collapses to a single class.
	if r < 0 || r >= 0x80 {
		return NonASCII
	}
	return int(r)
}
// Char represents a rune and its position.
type Char struct {
Rune rune
pos int32
}
// NewChar returns a new Char value.
func NewChar(pos token.Pos, r rune) Char { return Char{pos: int32(pos), Rune: r} }
// IsValid reports whether c is not a zero Char.
func (c Char) IsValid() bool { return c.Pos().IsValid() }
// Pos returns the token.Pos associated with c.
func (c Char) Pos() token.Pos { return token.Pos(c.pos) }
// CharReader is a RuneReader providing additionally explicit position
// information by returning a Char instead of a rune as its first result.
type CharReader interface {
	// ReadChar reads the next character and returns it together with the
	// number of bytes it occupied in the original source encoding.
	ReadChar() (c Char, size int, err error)
}
// Lexer supports golex[0] generated lexical analyzers.
type Lexer struct {
	File      *token.File             // The *token.File passed to New.
	First     Char                    // First remembers the lookahead char when Rule0 was invoked.
	Last      Char                    // Last remembers the last Char returned by Next.
	Prev      Char                    // Prev remembers the Char previous to Last.
	bomMode   int                     // See the BOM* constants.
	bytesBuf  bytes.Buffer            // Used by TokenBytes.
	charSrc   CharReader              // Lexer alternative input.
	classf    func(rune) int          // Rune-to-class mapping; see the RuneClass option.
	errorf    func(token.Pos, string) // Error reporter; see the ErrorFunc option.
	lookahead Char                    // Lookahead if non zero.
	mark      int                     // Longest match marker.
	off       int                     // Used for File.AddLine.
	src       io.RuneReader           // Lexer input.
	tokenBuf  []Char                  // Lexeme collector.
	ungetBuf  []Char                  // Unget buffer.
}
// New returns a new *Lexer. The result can be amended using opts.
//
// Non Unicode Input
//
// To consume sources in other encodings and still have exact position
// information, pass an io.RuneReader which returns the next input character
// reencoded as an Unicode rune but returns the size (number of bytes used to
// encode it) of the original character, not the size of its UTF-8
// representation after converted to an Unicode rune. Size is the second
// returned value of io.RuneReader.ReadRune method[4].
//
// When src optionally implements CharReader its ReadChar method is used
// instead of io.ReadRune.
func New(file *token.File, src io.RuneReader, opts ...Option) (*Lexer, error) {
	lx := &Lexer{
		File:    file,
		bomMode: BOMIgnoreFirst,
		classf:  DefaultRuneClass,
		src:     src,
	}
	lx.errorf = lx.defaultErrorf
	// Prefer the position-aware reader when the source provides one.
	if cr, ok := src.(CharReader); ok {
		lx.charSrc = cr
	}
	for _, opt := range opts {
		if err := opt(lx); err != nil {
			return nil, err
		}
	}
	return lx, nil
}
// Abort handles the situation when the scanner does not successfully recognize
// any token or when an attempt to find the longest match "overruns" from an
// accepting state only to never reach an accepting state again. In the first
// case the scanner was never in an accepting state since last call to Rule0
// and then (true, previousLookahead rune) is returned, effectively consuming a
// single Char token, avoiding scanner stall. Otherwise there was at least one
// accepting scanner state marked using Mark. In this case Abort rollbacks the
// lexer state to the marked state and returns (false, 0). The scanner must
// then execute a prescribed goto statement. For example:
//
//	%yyc c
//	%yyn c = l.Next()
//	%yym l.Mark()
//
//	%{
//	package foo
//
//	import (...)
//
//	type lexer struct {
//		*lex.Lexer
//		...
//	}
//
//	func newLexer(...) *lexer {
//		return &lexer{
//			lex.NewLexer(...),
//			...
//		}
//	}
//
//	func (l *lexer) scan() int {
//		c := l.Enter()
//	%}
//
//	... more lex definitions
//
//	%%
//
//	c = l.Rule0()
//
//	... lex rules
//
//	%%
//
//		if c, ok := l.Abort(); ok {
//			return c
//		}
//
//		goto yyAction
//	}
func (l *Lexer) Abort() (int, bool) {
	// A non-negative mark means Mark was called: roll back to that state.
	if l.mark >= 0 {
		if len(l.tokenBuf) > l.mark {
			// Push the lookahead and the over-consumed chars back so
			// they are re-read in original order (Unget is LIFO).
			l.Unget(l.lookahead)
			for i := len(l.tokenBuf) - 1; i >= l.mark; i-- {
				l.Unget(l.tokenBuf[i])
			}
		}
		l.tokenBuf = l.tokenBuf[:l.mark]
		return 0, false
	}
	// No accepting state was marked: emit one raw Char to avoid stalling.
	switch n := len(l.tokenBuf); n {
	case 0: // [] z
		// Nothing collected; consume and return the lookahead itself.
		c := l.lookahead
		l.Next()
		return int(c.Rune), true
	case 1: // [a] z
		// Exactly one collected char; return it, lookahead stays put.
		return int(l.tokenBuf[0].Rune), true
	default: // [a, b, ...], z
		// Return the first collected char; everything after the second
		// one, plus the old lookahead, is pushed back, and the second
		// collected char becomes the new lookahead.
		c := l.tokenBuf[0]    // a
		l.Unget(l.lookahead)  // z
		for i := n - 1; i > 1; i-- {
			l.Unget(l.tokenBuf[i]) // ...
		}
		l.lookahead = l.tokenBuf[1] // b
		l.tokenBuf = l.tokenBuf[:1]
		return int(c.Rune), true
	}
}
// class maps the current lookahead rune to its scanner class.
func (l *Lexer) class() int {
	return l.classf(l.lookahead.Rune)
}
// defaultErrorf is the error reporter installed by New. It renders the
// position and message into a single string and hands it to Error.
func (l *Lexer) defaultErrorf(pos token.Pos, msg string) {
	formatted := fmt.Sprintf("%v: %v", l.File.Position(pos), msg)
	l.Error(formatted)
}
// Enter ensures the lexer has a valid lookahead Char and returns its class.
// Typical use in an .l file
//
//	func (l *lexer) scan() lex.Char {
//		c := l.Enter()
//		...
func (l *Lexer) Enter() int {
	if l.lookahead.IsValid() {
		return l.class()
	}
	// No lookahead yet; fetch the first character.
	l.Next()
	return l.class()
}
// Error implements yyLexer[2] by printing the msg to stderr.
func (l *Lexer) Error(msg string) {
	fmt.Fprintf(os.Stderr, "%s\n", msg)
}
// Lookahead returns the current lookahead, fetching the first character
// when none has been read yet.
func (l *Lexer) Lookahead() Char {
	if l.lookahead.IsValid() {
		return l.lookahead
	}
	l.Next()
	return l.lookahead
}
// Mark records the current state of scanner as accepting. It implements the
// golex macro %yym. Typical usage in an .l file:
//
//	%yym l.Mark()
func (l *Lexer) Mark() {
	// Remember how much of the token was collected at the accepting state.
	l.mark = len(l.tokenBuf)
}
// next reads one character of input and makes it the new lookahead,
// returning its scanner class. The outgoing lookahead, if valid, is first
// appended to the token collector. Input is taken, in order of preference,
// from the unget buffer, then from the CharReader (when the source
// implements it), otherwise from the io.RuneReader.
//
// BOM runes are handled according to l.bomMode; see the BOM* constants. On a
// read error the source is dropped and RuneEOF becomes the lookahead rune;
// errors other than io.EOF are additionally reported via l.errorf.
func (l *Lexer) next() int {
	const bom = '\ufeff'
	// The outgoing lookahead becomes part of the collected token.
	if c := l.lookahead; c.IsValid() {
		l.tokenBuf = append(l.tokenBuf, c)
	}
	// Ungot characters take precedence over fresh input (LIFO order).
	if n := len(l.ungetBuf); n != 0 {
		l.lookahead = l.ungetBuf[n-1]
		l.ungetBuf = l.ungetBuf[:n-1]
		return l.class()
	}
	if l.src == nil { // Source already exhausted or failed.
		return RuneEOF
	}
	var r rune
	var sz int
	var err error
	var pos token.Pos
	var c Char
again:
	off0 := l.off
	switch cs := l.charSrc; {
	case cs != nil:
		// The source provides explicit position information.
		c, sz, err = cs.ReadChar()
		r = c.Rune
		pos = c.Pos()
	default:
		r, sz, err = l.src.ReadRune()
		pos = l.File.Pos(l.off)
	}
	l.off += sz
	if err != nil {
		l.src = nil
		r = RuneEOF
		if err != io.EOF {
			l.errorf(pos, err.Error())
		}
	}
	if r == bom {
		switch l.bomMode {
		default:
			fallthrough
		case BOMIgnoreFirst:
			if off0 != 0 {
				l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
			}
			goto again
		case BOMPassAll:
			// nop
		case BOMPassFirst:
			if off0 != 0 {
				l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
				goto again
			}
		case BOMError:
			switch {
			case off0 == 0:
				// Fixed typo in diagnostic: "beginnig" -> "beginning".
				l.errorf(pos, "unicode (UTF-8) BOM at beginning of file")
			default:
				l.errorf(pos, "unicode (UTF-8) BOM in middle of file")
			}
			goto again
		}
	}
	l.lookahead = NewChar(pos, r)
	if r == '\n' {
		// Record line starts so token.File can map offsets to lines.
		l.File.AddLine(l.off)
	}
	return l.class()
}
// Next advances the scanner for one rune and returns the respective character
// class of the new lookahead. Typical usage in an .l file:
//
//	%yyn c = l.Next()
func (l *Lexer) Next() int {
	// Shift the Last/Prev history before reading the next character.
	l.Prev = l.Last
	class := l.next()
	l.Last = l.lookahead
	return class
}
// Offset returns the current reading offset of the lexer's source.
func (l *Lexer) Offset() int {
	return l.off
}
// Rule0 initializes the scanner state before the attempt to recognize a token
// starts. The token collecting buffer is cleared. Rule0 records the current
// lookahead in l.First and returns its class. Typical usage in an .l file:
//
//	... lex definitions
//
//	%%
//
//	c := l.Rule0()
//
//	first-pattern-regexp
func (l *Lexer) Rule0() int {
	if !l.lookahead.IsValid() {
		l.Next()
	}
	l.First = l.lookahead
	l.mark = -1
	switch {
	case len(l.tokenBuf) > 1<<18: //DONE constant tuned
		// Drop an oversized collector rather than retaining its
		// capacity indefinitely.
		l.tokenBuf = nil
	default:
		l.tokenBuf = l.tokenBuf[:0]
	}
	return l.class()
}
// Token returns the currently collected token chars. The result is R/O.
func (l *Lexer) Token() []Char {
	return l.tokenBuf
}
// TokenBytes returns the UTF-8 encoding of Token. If builder is not nil then
// it's called instead to build the encoded token byte value into the buffer
// passed to it.
//
// The Result is R/O.
func (l *Lexer) TokenBytes(builder func(*bytes.Buffer)) []byte {
	// Recycle the buffer unless it has grown unreasonably large.
	if l.bytesBuf.Len() < 1<<18 { //DONE constant tuned
		l.bytesBuf.Reset()
	} else {
		l.bytesBuf = bytes.Buffer{}
	}
	if builder != nil {
		builder(&l.bytesBuf)
	} else {
		for _, ch := range l.Token() {
			l.bytesBuf.WriteRune(ch.Rune)
		}
	}
	return l.bytesBuf.Bytes()
}
// Unget unreads all chars in c.
func (l *Lexer) Unget(c ...Char) {
	l.ungetBuf = append(l.ungetBuf, c...)
	l.lookahead = Char{} // Must invalidate lookahead.
}
// Option is a function which can be passed as an optional argument to New.
// An Option mutates the Lexer under construction and may reject invalid
// configuration by returning a non-nil error, which New propagates.
type Option func(*Lexer) error
// BOMMode option selects how the lexer handles BOMs. See the BOM* constants
// for details.
func BOMMode(mode int) Option {
	return func(lx *Lexer) error {
		lx.bomMode = mode
		return nil
	}
}
// ErrorFunc option sets a function called when an, for example I/O error,
// occurs. The default is to call Error with the position and message already
// formatted as a string.
func ErrorFunc(f func(token.Pos, string)) Option {
	return func(lx *Lexer) error {
		lx.errorf = f
		return nil
	}
}
// RuneClass option sets the function used to convert runes to character
// classes.
func RuneClass(f func(rune) int) Option {
	return func(lx *Lexer) error {
		lx.classf = f
		return nil
	}
}