src/cmd/compile/internal/syntax/source.go - go - Git at Google

 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // This file implements source, a buffered rune reader
 // which is specialized for the needs of the Go scanner:
 // Contiguous sequences of runes (literals) are extracted
 // directly as []byte without the need to re-encode the
 // runes in UTF-8 (as would be necessary with bufio.Reader).
 //
 // This file is self-contained (go tool compile source.go
 // compiles) and thus could be made into its own package.

 package syntax

 import (
 	"io"
 	"unicode/utf8"
 )

 // starting points for line and column numbers
 const linebase = 1
 const colbase = 1

 // buf [...read...|...|...unread...|s|...free...]
 //         ^      ^   ^            ^
 //         |      |   |            |
 //        suf     r0  r            w

 type source struct {
 	src  io.Reader
 	errh func(line, pos uint, msg string)

 	// source buffer
 	buf         [4 << 10]byte
 	offs        int   // source offset of buf
 	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
 	line0, line uint  // previous/current line
 	col0, col   uint  // previous/current column (byte offsets from line start)
 	ioerr       error // pending io error

 	// literal buffer
 	lit []byte // literal prefix
 	suf int    // literal suffix; suf >= 0 means we are scanning a literal
 }

 // init initializes source to read from src and to report errors via errh.
 // errh must not be nil.
 func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
 	s.src = src
 	s.errh = errh

 	s.buf[0] = utf8.RuneSelf // terminate with sentinel
 	s.offs = 0
 	s.r0, s.r, s.w = 0, 0, 0
 	s.line0, s.line = 0, linebase
 	s.col0, s.col = 0, colbase
 	s.ioerr = nil

 	s.lit = s.lit[:0]
 	s.suf = -1
 }

 // ungetr ungets the most recently read rune.
 func (s *source) ungetr() {
 	s.r, s.line, s.col = s.r0, s.line0, s.col0
 }

 // ungetr2 is like ungetr but enables a 2nd ungetr.
 // It must not be called if one of the runes seen
 // was a newline.
 func (s *source) ungetr2() {
 	s.ungetr()
 	// line must not have changed
 	s.r0--
 	s.col0--
 }

 func (s *source) error(msg string) {
 	s.errh(s.line0, s.col0, msg)
 }

 // getr reads and returns the next rune.
 //
 // If a read or source encoding error occurs, getr
 // calls the error handler installed with init.
 // The handler must exist.
 //
 // The (line, col) position passed to the error handler
 // is always at the current source reading position.
 func (s *source) getr() rune {
 redo:
 	s.r0, s.line0, s.col0 = s.r, s.line, s.col

 	// We could avoid at least one test that is always taken in the
 	// for loop below by duplicating the common case code (ASCII)
 	// here since we always have at least the sentinel (utf8.RuneSelf)
 	// in the buffer. Measure and optimize if necessary.

 	// make sure we have at least one rune in buffer, or we are at EOF
 	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
 		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
 	}

 	// common case: ASCII and enough bytes
 	// (invariant: s.buf[s.w] == utf8.RuneSelf)
 	if b := s.buf[s.r]; b < utf8.RuneSelf {
 		s.r++
 		// TODO(gri) Optimization: Instead of adjusting s.col for each character,
 		// remember the line offset instead and then compute the offset as needed
 		// (which is less often).
 		s.col++
 		if b == 0 {
 			s.error("invalid NUL character")
 			goto redo
 		}
 		if b == '\n' {
 			s.line++
 			s.col = colbase
 		}
 		return rune(b)
 	}

 	// EOF
 	if s.r == s.w {
 		if s.ioerr != io.EOF {
 			// ensure we never start with a '/' (e.g., rooted path) in the error message
 			s.error("I/O error: " + s.ioerr.Error())
 		}
 		return -1
 	}

 	// uncommon case: not ASCII
 	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
 	s.r += w
 	s.col += uint(w)

 	if r == utf8.RuneError && w == 1 {
 		s.error("invalid UTF-8 encoding")
 		goto redo
 	}

 	// BOM's are only allowed as the first character in a file
 	const BOM = 0xfeff
 	if r == BOM {
 		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
 			s.error("invalid BOM in the middle of the file")
 		}
 		goto redo
 	}

 	return r
 }

 func (s *source) fill() {
 	// Slide unread bytes to beginning but preserve last read char
 	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
 	// call, only for ".." character sequence and float literals
 	// starting with ".").
 	if s.r0 > 1 {
 		// save literal prefix, if any
 		// (We see at most one ungetr call while reading
 		// a literal, so make sure s.r0 remains in buf.)
 		if s.suf >= 0 {
 			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
 			s.suf = 1 // == s.r0 after slide below
 		}
 		n := s.r0 - 1
 		copy(s.buf[:], s.buf[n:s.w])
 		s.offs += n
 		s.r0 = 1 // eqv: s.r0 -= n
 		s.r -= n
 		s.w -= n
 	}

 	// read more data: try a limited number of times
 	for i := 100; i > 0; i-- {
 		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
 		if n < 0 {
 			panic("negative read") // incorrect underlying io.Reader implementation
 		}
 		s.w += n
 		if n > 0 || err != nil {
 			s.buf[s.w] = utf8.RuneSelf // sentinel
 			if err != nil {
 				s.ioerr = err
 			}
 			return
 		}
 	}

 	s.ioerr = io.ErrNoProgress
 }

 func (s *source) startLit() {
 	s.suf = s.r0
 	s.lit = s.lit[:0] // reuse lit
 }

 func (s *source) stopLit() []byte {
 	lit := s.buf[s.suf:s.r]
 	if len(s.lit) > 0 {
 		lit = append(s.lit, lit...)
 	}
 	s.killLit()
 	return lit
 }

 func (s *source) killLit() {
 	s.suf = -1 // no pending literal
 }
	// Copyright 2016 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// This file implements source, a buffered rune reader
	// which is specialized for the needs of the Go scanner:
	// Contiguous sequences of runes (literals) are extracted
	// directly as []byte without the need to re-encode the
	// runes in UTF-8 (as would be necessary with bufio.Reader).
	//
	// This file is self-contained (go tool compile source.go
	// compiles) and thus could be made into its own package.

	package syntax

	import (
	"io"
	"unicode/utf8"
	)

	// starting points for line and column numbers
	const linebase = 1
	const colbase = 1

	// buf [...read...\|...\|...unread...\|s\|...free...]
	// ^ ^ ^ ^
	// \| \| \| \|
	// suf r0 r w

	type source struct {
	src io.Reader
	errh func(line, pos uint, msg string)

	// source buffer
	buf [4 << 10]byte
	offs int // source offset of buf
	r0, r, w int // previous/current read and write buf positions, excluding sentinel
	line0, line uint // previous/current line
	col0, col uint // previous/current column (byte offsets from line start)
	ioerr error // pending io error

	// literal buffer
	lit []byte // literal prefix
	suf int // literal suffix; suf >= 0 means we are scanning a literal
	}

	// init initializes source to read from src and to report errors via errh.
	// errh must not be nil.
	func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
	s.src = src
	s.errh = errh

	s.buf[0] = utf8.RuneSelf // terminate with sentinel
	s.offs = 0
	s.r0, s.r, s.w = 0, 0, 0
	s.line0, s.line = 0, linebase
	s.col0, s.col = 0, colbase
	s.ioerr = nil

	s.lit = s.lit[:0]
	s.suf = -1
	}

	// ungetr ungets the most recently read rune.
	func (s *source) ungetr() {
	s.r, s.line, s.col = s.r0, s.line0, s.col0
	}

	// ungetr2 is like ungetr but enables a 2nd ungetr.
	// It must not be called if one of the runes seen
	// was a newline.
	func (s *source) ungetr2() {
	s.ungetr()
	// line must not have changed
	s.r0--
	s.col0--
	}

	func (s *source) error(msg string) {
	s.errh(s.line0, s.col0, msg)
	}

	// getr reads and returns the next rune.
	//
	// If a read or source encoding error occurs, getr
	// calls the error handler installed with init.
	// The handler must exist.
	//
	// The (line, col) position passed to the error handler
	// is always at the current source reading position.
	func (s *source) getr() rune {
	redo:
	s.r0, s.line0, s.col0 = s.r, s.line, s.col

	// We could avoid at least one test that is always taken in the
	// for loop below by duplicating the common case code (ASCII)
	// here since we always have at least the sentinel (utf8.RuneSelf)
	// in the buffer. Measure and optimize if necessary.

	// make sure we have at least one rune in buffer, or we are at EOF
	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
	s.fill() // s.w-s.r < len(s.buf) => buffer is not full
	}

	// common case: ASCII and enough bytes
	// (invariant: s.buf[s.w] == utf8.RuneSelf)
	if b := s.buf[s.r]; b < utf8.RuneSelf {
	s.r++
	// TODO(gri) Optimization: Instead of adjusting s.col for each character,
	// remember the line offset instead and then compute the offset as needed
	// (which is less often).
	s.col++
	if b == 0 {
	s.error("invalid NUL character")
	goto redo
	}
	if b == '\n' {
	s.line++
	s.col = colbase
	}
	return rune(b)
	}

	// EOF
	if s.r == s.w {
	if s.ioerr != io.EOF {
	// ensure we never start with a '/' (e.g., rooted path) in the error message
	s.error("I/O error: " + s.ioerr.Error())
	}
	return -1
	}

	// uncommon case: not ASCII
	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
	s.r += w
	s.col += uint(w)

	if r == utf8.RuneError && w == 1 {
	s.error("invalid UTF-8 encoding")
	goto redo
	}

	// BOM's are only allowed as the first character in a file
	const BOM = 0xfeff
	if r == BOM {
	if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
	s.error("invalid BOM in the middle of the file")
	}
	goto redo
	}

	return r
	}

	func (s *source) fill() {
	// Slide unread bytes to beginning but preserve last read char
	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
	// call, only for ".." character sequence and float literals
	// starting with ".").
	if s.r0 > 1 {
	// save literal prefix, if any
	// (We see at most one ungetr call while reading
	// a literal, so make sure s.r0 remains in buf.)
	if s.suf >= 0 {
	s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
	s.suf = 1 // == s.r0 after slide below
	}
	n := s.r0 - 1
	copy(s.buf[:], s.buf[n:s.w])
	s.offs += n
	s.r0 = 1 // eqv: s.r0 -= n
	s.r -= n
	s.w -= n
	}

	// read more data: try a limited number of times
	for i := 100; i > 0; i-- {
	n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
	if n < 0 {
	panic("negative read") // incorrect underlying io.Reader implementation
	}
	s.w += n
	if n > 0 \|\| err != nil {
	s.buf[s.w] = utf8.RuneSelf // sentinel
	if err != nil {
	s.ioerr = err
	}
	return
	}
	}

	s.ioerr = io.ErrNoProgress
	}

	func (s *source) startLit() {
	s.suf = s.r0
	s.lit = s.lit[:0] // reuse lit
	}

	func (s *source) stopLit() []byte {
	lit := s.buf[s.suf:s.r]
	if len(s.lit) > 0 {
	lit = append(s.lit, lit...)
	}
	s.killLit()
	return lit
	}

	func (s *source) killLit() {
	s.suf = -1 // no pending literal
	}