blob: 01b592152bb7886885e2b870f513d81d7d20ead0 [file] [log] [blame]
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file implements source, a buffered rune reader
// specialized for scanning Go code: Reading
// ASCII characters, maintaining current (line, col)
// position information, and recording of the most
// recently read source segment are highly optimized.
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.
package syntax
import (
"io"
"unicode/utf8"
)
// The source buffer is accessed using three indices b (begin),
// r (read), and e (end):
//
// - If b >= 0, it points to the beginning of a segment of most
// recently read characters (typically a Go literal).
//
// - r points to the byte immediately following the most recently
// read character ch, which starts at r-chw.
//
// - e points to the byte immediately following the last byte that
// was read into the buffer.
//
// The buffer content is terminated at buf[e] with the sentinel
// character utf8.RuneSelf. This makes it possible to test for
// the common case of ASCII characters with a single 'if' (see
// nextch method).
//
// +------ content in use -------+
// v v
// buf [...read...|...segment...|ch|...unread...|s|...free...]
// ^ ^ ^ ^
// | | | |
// b r-chw r e
//
// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
type source struct {
in io.Reader
errh func(line, col uint, msg string)
buf []byte // source buffer
ioerr error // pending I/O error, or nil
b, r, e int // buffer indices (see comment above)
line, col uint // source position of ch (0-based)
ch rune // most recently read character
chw int // width of ch
}
const sentinel = utf8.RuneSelf
func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) {
s.in = in
s.errh = errh
if s.buf == nil {
s.buf = make([]byte, nextSize(0))
}
s.buf[0] = sentinel
s.ioerr = nil
s.b, s.r, s.e = -1, 0, 0
s.line, s.col = 0, 0
s.ch = ' '
s.chw = 0
}
// starting points for line and column numbers
const linebase = 1
const colbase = 1
// pos returns the (line, col) source position of s.ch.
func (s *source) pos() (line, col uint) {
return linebase + s.line, colbase + s.col
}
// error reports the error msg at source position s.pos().
func (s *source) error(msg string) {
line, col := s.pos()
s.errh(line, col, msg)
}
// start starts a new active source segment (including s.ch).
// As long as stop has not been called, the active segment's
// bytes (excluding s.ch) may be retrieved by calling segment.
func (s *source) start() { s.b = s.r - s.chw }
func (s *source) stop() { s.b = -1 }
func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
// rewind rewinds the scanner's read position and character s.ch
// to the start of the currently active segment, which must not
// contain any newlines (otherwise position information will be
// incorrect). Currently, rewind is only needed for handling the
// source sequence ".."; it must not be called outside an active
// segment.
func (s *source) rewind() {
// ok to verify precondition - rewind is rarely called
if s.b < 0 {
panic("no active segment")
}
s.col -= uint(s.r - s.b)
s.r = s.b
s.nextch()
}
func (s *source) nextch() {
redo:
s.col += uint(s.chw)
if s.ch == '\n' {
s.line++
s.col = 0
}
// fast common case: at least one ASCII character
if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
s.r++
s.chw = 1
if s.ch == 0 {
s.error("invalid NUL character")
goto redo
}
return
}
// slower general case: add more bytes to buffer if we don't have a full rune
for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
s.fill()
}
// EOF
if s.r == s.e {
if s.ioerr != io.EOF {
// ensure we never start with a '/' (e.g., rooted path) in the error message
s.error("I/O error: " + s.ioerr.Error())
s.ioerr = nil
}
s.ch = -1
s.chw = 0
return
}
s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e])
s.r += s.chw
if s.ch == utf8.RuneError && s.chw == 1 {
s.error("invalid UTF-8 encoding")
goto redo
}
// BOM's are only allowed as the first character in a file
const BOM = 0xfeff
if s.ch == BOM {
if s.line > 0 || s.col > 0 {
s.error("invalid BOM in the middle of the file")
}
goto redo
}
}
// fill reads more source bytes into s.buf.
// It returns with at least one more byte in the buffer, or with s.ioerr != nil.
func (s *source) fill() {
// determine content to preserve
b := s.r
if s.b >= 0 {
b = s.b
s.b = 0 // after buffer has grown or content has been moved down
}
content := s.buf[b:s.e]
// grow buffer or move content down
if len(content)*2 > len(s.buf) {
s.buf = make([]byte, nextSize(len(s.buf)))
copy(s.buf, content)
} else if b > 0 {
copy(s.buf, content)
}
s.r -= b
s.e -= b
// read more data: try a limited number of times
for i := 0; i < 10; i++ {
var n int
n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel
if n < 0 {
panic("negative read") // incorrect underlying io.Reader implementation
}
if n > 0 || s.ioerr != nil {
s.e += n
s.buf[s.e] = sentinel
return
}
// n == 0
}
s.buf[s.e] = sentinel
s.ioerr = io.ErrNoProgress
}
// nextSize returns the next bigger size for a buffer of a given size.
func nextSize(size int) int {
const min = 4 << 10 // 4K: minimum buffer size
const max = 1 << 20 // 1M: maximum buffer size which is still doubled
if size < min {
return min
}
if size <= max {
return size << 1
}
return size + max
}