| // Copyright 2016 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // This file implements source, a buffered rune reader |
| // specialized for scanning Go code: Reading |
| // ASCII characters, maintaining current (line, col) |
| // position information, and recording of the most |
| // recently read source segment are highly optimized. |
| // This file is self-contained (go tool compile source.go |
| // compiles) and thus could be made into its own package. |
| |
| package syntax |
| |
| import ( |
| "io" |
| "unicode/utf8" |
| ) |
| |
| // The source buffer is accessed using three indices b (begin), |
| // r (read), and e (end): |
| // |
| // - If b >= 0, it points to the beginning of a segment of most |
| // recently read characters (typically a Go literal). |
| // |
| // - r points to the byte immediately following the most recently |
| // read character ch, which starts at r-chw. |
| // |
| // - e points to the byte immediately following the last byte that |
| // was read into the buffer. |
| // |
| // The buffer content is terminated at buf[e] with the sentinel |
| // character utf8.RuneSelf. This makes it possible to test for |
| // the common case of ASCII characters with a single 'if' (see |
| // nextch method). |
| // |
| //                +------ content in use -------+ |
| //                v                             v |
| // buf [...read...|...segment...|ch|...unread...|s|...free...] |
| //                ^             ^  ^            ^ |
| //                |             |  |            | |
| //                b         r-chw  r            e |
| // |
| // Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel |
| |
// source is a buffered rune reader over in. It keeps track of the
// (line, col) position of the current character ch and can record
// the most recently read segment of the buffer.
type source struct {
	in   io.Reader                        // reader supplying the source bytes
	errh func(line, col uint, msg string) // handler invoked by error

	buf   []byte // source buffer
	ioerr error  // pending I/O error, or nil

	// buffer indices (see comment above)
	b int // start of the active segment, or < 0 if there is none
	r int // read position: index of the byte following ch
	e int // end of the content read so far; buf[e] holds the sentinel

	// source position of ch (0-based)
	line uint
	col  uint

	ch  rune // most recently read character
	chw int  // width of ch in bytes
}
| |
// sentinel is the byte value stored at buf[e] to terminate the buffer
// content. It equals utf8.RuneSelf, so a single "less than sentinel"
// comparison in nextch distinguishes an ASCII byte from everything else,
// including the end of the buffered content.
const sentinel = utf8.RuneSelf
| |
| func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) { |
| s.in = in |
| s.errh = errh |
| |
| if s.buf == nil { |
| s.buf = make([]byte, nextSize(0)) |
| } |
| s.buf[0] = sentinel |
| s.ioerr = nil |
| s.b, s.r, s.e = -1, 0, 0 |
| s.line, s.col = 0, 0 |
| s.ch = ' ' |
| s.chw = 0 |
| } |
| |
// Starting points for line and column numbers: pos adds these to the
// 0-based line and col counters.
const (
	linebase = 1
	colbase  = 1
)
| |
| // pos returns the (line, col) source position of s.ch. |
| func (s *source) pos() (line, col uint) { |
| return linebase + s.line, colbase + s.col |
| } |
| |
| // error reports the error msg at source position s.pos(). |
| func (s *source) error(msg string) { |
| line, col := s.pos() |
| s.errh(line, col, msg) |
| } |
| |
| // start starts a new active source segment (including s.ch). |
| // As long as stop has not been called, the active segment's |
| // bytes (excluding s.ch) may be retrieved by calling segment. |
| func (s *source) start() { s.b = s.r - s.chw } |
| func (s *source) stop() { s.b = -1 } |
| func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] } |
| |
| // rewind rewinds the scanner's read position and character s.ch |
| // to the start of the currently active segment, which must not |
| // contain any newlines (otherwise position information will be |
| // incorrect). Currently, rewind is only needed for handling the |
| // source sequence ".."; it must not be called outside an active |
| // segment. |
| func (s *source) rewind() { |
| // ok to verify precondition - rewind is rarely called |
| if s.b < 0 { |
| panic("no active segment") |
| } |
| s.col -= uint(s.r - s.b) |
| s.r = s.b |
| s.nextch() |
| } |
| |
| func (s *source) nextch() { |
| redo: |
| s.col += uint(s.chw) |
| if s.ch == '\n' { |
| s.line++ |
| s.col = 0 |
| } |
| |
| // fast common case: at least one ASCII character |
| if s.ch = rune(s.buf[s.r]); s.ch < sentinel { |
| s.r++ |
| s.chw = 1 |
| if s.ch == 0 { |
| s.error("invalid NUL character") |
| goto redo |
| } |
| return |
| } |
| |
| // slower general case: add more bytes to buffer if we don't have a full rune |
| for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil { |
| s.fill() |
| } |
| |
| // EOF |
| if s.r == s.e { |
| if s.ioerr != io.EOF { |
| // ensure we never start with a '/' (e.g., rooted path) in the error message |
| s.error("I/O error: " + s.ioerr.Error()) |
| s.ioerr = nil |
| } |
| s.ch = -1 |
| s.chw = 0 |
| return |
| } |
| |
| s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e]) |
| s.r += s.chw |
| |
| if s.ch == utf8.RuneError && s.chw == 1 { |
| s.error("invalid UTF-8 encoding") |
| goto redo |
| } |
| |
| // BOM's are only allowed as the first character in a file |
| const BOM = 0xfeff |
| if s.ch == BOM { |
| if s.line > 0 || s.col > 0 { |
| s.error("invalid BOM in the middle of the file") |
| } |
| goto redo |
| } |
| } |
| |
| // fill reads more source bytes into s.buf. |
| // It returns with at least one more byte in the buffer, or with s.ioerr != nil. |
| func (s *source) fill() { |
| // determine content to preserve |
| b := s.r |
| if s.b >= 0 { |
| b = s.b |
| s.b = 0 // after buffer has grown or content has been moved down |
| } |
| content := s.buf[b:s.e] |
| |
| // grow buffer or move content down |
| if len(content)*2 > len(s.buf) { |
| s.buf = make([]byte, nextSize(len(s.buf))) |
| copy(s.buf, content) |
| } else if b > 0 { |
| copy(s.buf, content) |
| } |
| s.r -= b |
| s.e -= b |
| |
| // read more data: try a limited number of times |
| for i := 0; i < 10; i++ { |
| var n int |
| n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel |
| if n < 0 { |
| panic("negative read") // incorrect underlying io.Reader implementation |
| } |
| if n > 0 || s.ioerr != nil { |
| s.e += n |
| s.buf[s.e] = sentinel |
| return |
| } |
| // n == 0 |
| } |
| |
| s.buf[s.e] = sentinel |
| s.ioerr = io.ErrNoProgress |
| } |
| |
// nextSize returns the next bigger size for a buffer of the given size.
// Sizes below 4K are raised to 4K, sizes up to and including 1M are
// doubled, and larger sizes grow by 1M.
func nextSize(size int) int {
	const (
		floor    = 4 << 10 // 4K: minimum buffer size
		doubling = 1 << 20 // 1M: maximum buffer size which is still doubled
	)
	switch {
	case size < floor:
		return floor
	case size <= doubling:
		return size * 2
	default:
		return size + doubling
	}
}