| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package bufio |
| |
| import ( |
| "bytes" |
| "errors" |
| "io" |
| "unicode/utf8" |
| ) |
| |
| // Scanner provides a convenient interface for reading data such as |
| // a file of newline-delimited lines of text. Successive calls to |
| // the Scan method will step through the 'tokens' of a file, skipping |
| // the bytes between the tokens. The specification of a token is |
| // defined by a split function of type SplitFunc; the default split |
| // function breaks the input into lines with line termination stripped. Split |
| // functions are defined in this package for scanning a file into |
| // lines, bytes, UTF-8-encoded runes, and space-delimited words. The |
| // client may instead provide a custom split function. |
| // |
| // Scanning stops unrecoverably at EOF, the first I/O error, or a token too |
| // large to fit in the buffer. When a scan stops, the reader may have |
| // advanced arbitrarily far past the last token. Programs that need more |
| // control over error handling or large tokens, or must run sequential scans |
| // on a reader, should use bufio.Reader instead. |
| // |
| type Scanner struct { |
| r io.Reader // The reader provided by the client. |
| split SplitFunc // The function to split the tokens. |
| maxTokenSize int // Maximum size of a token; modified by tests. |
| token []byte // Last token returned by split. |
| buf []byte // Buffer used as argument to split. |
| start int // First non-processed byte in buf. |
| end int // End of data in buf. |
| err error // Sticky error. |
| empties int // Count of successive empty tokens. |
| scanCalled bool // Scan has been called; buffer is in use. |
| done bool // Scan has finished. |
| } |
| |
// SplitFunc is the signature of the split function a Scanner uses to
// tokenize its input.
//
// Arguments: data is an initial substring of the remaining unprocessed
// input, and atEOF reports whether the Reader has no more data to give.
//
// Results: advance is the number of bytes by which to advance the input,
// token is the next token to hand to the user (if any), and err is an
// error (if any).
//
// If the function returns an error, scanning stops and some of the input
// may be discarded.
//
// Otherwise the Scanner advances the input. A non-nil token is returned
// to the user. A nil token makes the Scanner read more data and continue
// scanning; if there is no more data, that is, if atEOF was true, the
// Scanner returns. When the data does not yet hold a complete token (for
// instance, no newline has been seen while scanning lines), a SplitFunc
// can return (0, nil, nil) to ask the Scanner to read more data into the
// slice and call again with a longer slice that starts at the same point
// in the input.
//
// The function is never called with an empty data slice unless atEOF is
// true. When atEOF is true, however, data may still be non-empty and, as
// always, holds unprocessed text.
type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error)
| |
// Errors returned by Scanner.
var (
	// ErrTooLong is recorded when the buffer is full, cannot grow any
	// further, and still does not yield a token.
	ErrTooLong = errors.New("bufio.Scanner: token too long")
	// ErrNegativeAdvance is recorded when the split function returns a
	// negative advance count.
	ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count")
	// ErrAdvanceTooFar is recorded when the split function returns an
	// advance count larger than the data it was given.
	ErrAdvanceTooFar = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input")
)
| |
const (
	// MaxScanTokenSize is the maximum size used to buffer a token
	// when the user has not supplied an explicit buffer with
	// Scanner.Buffer. The largest token that actually fits may be
	// smaller, because the buffer may also need to hold, for
	// instance, a newline.
	MaxScanTokenSize = 64 << 10 // 64 KiB

	// startBufSize is the size of the initial allocation for the buffer.
	startBufSize = 4 << 10 // 4 KiB
)
| |
| // NewScanner returns a new Scanner to read from r. |
| // The split function defaults to ScanLines. |
| func NewScanner(r io.Reader) *Scanner { |
| return &Scanner{ |
| r: r, |
| split: ScanLines, |
| maxTokenSize: MaxScanTokenSize, |
| } |
| } |
| |
| // Err returns the first non-EOF error that was encountered by the Scanner. |
| func (s *Scanner) Err() error { |
| if s.err == io.EOF { |
| return nil |
| } |
| return s.err |
| } |
| |
| // Bytes returns the most recent token generated by a call to Scan. |
| // The underlying array may point to data that will be overwritten |
| // by a subsequent call to Scan. It does no allocation. |
| func (s *Scanner) Bytes() []byte { |
| return s.token |
| } |
| |
| // Text returns the most recent token generated by a call to Scan |
| // as a newly allocated string holding its bytes. |
| func (s *Scanner) Text() string { |
| return string(s.token) |
| } |
| |
// ErrFinalToken is a special sentinel error value. A split function
// returns it to say that the token delivered alongside the error is the
// last one and that scanning should stop after it. When Scan receives
// ErrFinalToken it delivers that token and then stops with no error
// recorded.
//
// The value is useful for stopping early, or when a final empty token
// must be delivered. A custom error value could achieve the same effect,
// but providing one here is tidier.
// See the emptyFinalToken example for a use of this value.
var ErrFinalToken = errors.New("final token")
| |
| // Scan advances the Scanner to the next token, which will then be |
| // available through the Bytes or Text method. It returns false when the |
| // scan stops, either by reaching the end of the input or an error. |
| // After Scan returns false, the Err method will return any error that |
| // occurred during scanning, except that if it was io.EOF, Err |
| // will return nil. |
| // Scan panics if the split function returns too many empty |
| // tokens without advancing the input. This is a common error mode for |
| // scanners. |
| func (s *Scanner) Scan() bool { |
| if s.done { |
| return false |
| } |
| s.scanCalled = true |
| // Loop until we have a token. |
| for { |
| // See if we can get a token with what we already have. |
| // If we've run out of data but have an error, give the split function |
| // a chance to recover any remaining, possibly empty token. |
| if s.end > s.start || s.err != nil { |
| advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil) |
| if err != nil { |
| if err == ErrFinalToken { |
| s.token = token |
| s.done = true |
| return true |
| } |
| s.setErr(err) |
| return false |
| } |
| if !s.advance(advance) { |
| return false |
| } |
| s.token = token |
| if token != nil { |
| if s.err == nil || advance > 0 { |
| s.empties = 0 |
| } else { |
| // Returning tokens not advancing input at EOF. |
| s.empties++ |
| if s.empties > maxConsecutiveEmptyReads { |
| panic("bufio.Scan: too many empty tokens without progressing") |
| } |
| } |
| return true |
| } |
| } |
| // We cannot generate a token with what we are holding. |
| // If we've already hit EOF or an I/O error, we are done. |
| if s.err != nil { |
| // Shut it down. |
| s.start = 0 |
| s.end = 0 |
| return false |
| } |
| // Must read more data. |
| // First, shift data to beginning of buffer if there's lots of empty space |
| // or space is needed. |
| if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) { |
| copy(s.buf, s.buf[s.start:s.end]) |
| s.end -= s.start |
| s.start = 0 |
| } |
| // Is the buffer full? If so, resize. |
| if s.end == len(s.buf) { |
| // Guarantee no overflow in the multiplication below. |
| const maxInt = int(^uint(0) >> 1) |
| if len(s.buf) >= s.maxTokenSize || len(s.buf) > maxInt/2 { |
| s.setErr(ErrTooLong) |
| return false |
| } |
| newSize := len(s.buf) * 2 |
| if newSize == 0 { |
| newSize = startBufSize |
| } |
| if newSize > s.maxTokenSize { |
| newSize = s.maxTokenSize |
| } |
| newBuf := make([]byte, newSize) |
| copy(newBuf, s.buf[s.start:s.end]) |
| s.buf = newBuf |
| s.end -= s.start |
| s.start = 0 |
| } |
| // Finally we can read some input. Make sure we don't get stuck with |
| // a misbehaving Reader. Officially we don't need to do this, but let's |
| // be extra careful: Scanner is for safe, simple jobs. |
| for loop := 0; ; { |
| n, err := s.r.Read(s.buf[s.end:len(s.buf)]) |
| s.end += n |
| if err != nil { |
| s.setErr(err) |
| break |
| } |
| if n > 0 { |
| s.empties = 0 |
| break |
| } |
| loop++ |
| if loop > maxConsecutiveEmptyReads { |
| s.setErr(io.ErrNoProgress) |
| break |
| } |
| } |
| } |
| } |
| |
| // advance consumes n bytes of the buffer. It reports whether the advance was legal. |
| func (s *Scanner) advance(n int) bool { |
| if n < 0 { |
| s.setErr(ErrNegativeAdvance) |
| return false |
| } |
| if n > s.end-s.start { |
| s.setErr(ErrAdvanceTooFar) |
| return false |
| } |
| s.start += n |
| return true |
| } |
| |
| // setErr records the first error encountered. |
| func (s *Scanner) setErr(err error) { |
| if s.err == nil || s.err == io.EOF { |
| s.err = err |
| } |
| } |
| |
| // Buffer sets the initial buffer to use when scanning and the maximum |
| // size of buffer that may be allocated during scanning. The maximum |
| // token size is the larger of max and cap(buf). If max <= cap(buf), |
| // Scan will use this buffer only and do no allocation. |
| // |
| // By default, Scan uses an internal buffer and sets the |
| // maximum token size to MaxScanTokenSize. |
| // |
| // Buffer panics if it is called after scanning has started. |
| func (s *Scanner) Buffer(buf []byte, max int) { |
| if s.scanCalled { |
| panic("Buffer called after Scan") |
| } |
| s.buf = buf[0:cap(buf)] |
| s.maxTokenSize = max |
| } |
| |
| // Split sets the split function for the Scanner. |
| // The default split function is ScanLines. |
| // |
| // Split panics if it is called after scanning has started. |
| func (s *Scanner) Split(split SplitFunc) { |
| if s.scanCalled { |
| panic("Split called after Scan") |
| } |
| s.split = split |
| } |
| |
| // Split functions |
| |
// ScanBytes is a split function for a Scanner that returns each byte as a token.
//
// When data is empty it returns (0, nil, nil) whatever the value of atEOF:
// at EOF this means there are no more tokens, and otherwise it asks for
// more data. A Scanner never passes empty data unless atEOF is true, but
// handling the case keeps direct callers and wrapping split functions
// from panicking on the slice expression below.
func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if len(data) == 0 {
		return 0, nil, nil
	}
	return 1, data[0:1], nil
}
| |
// errorRune is the UTF-8 encoding of utf8.RuneError (U+FFFD), returned
// by ScanRunes as the token for an invalid encoding.
var errorRune = []byte(string(utf8.RuneError))

// ScanRunes is a split function for a Scanner that returns each
// UTF-8-encoded rune as a token. The sequence of runes returned is
// equivalent to that from a range loop over the input as a string, which
// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
// Because of the Scan interface, this makes it impossible for the client to
// distinguish correctly encoded replacement runes from encoding errors.
//
// When data is empty it returns (0, nil, nil) whatever the value of atEOF:
// at EOF this means there are no more tokens, and otherwise it asks for
// more data. A Scanner never passes empty data unless atEOF is true, but
// handling the case keeps direct callers from panicking on data[0].
func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if len(data) == 0 {
		return 0, nil, nil
	}

	// Fast path 1: ASCII.
	if data[0] < utf8.RuneSelf {
		return 1, data[0:1], nil
	}

	// Fast path 2: Correct UTF-8 decode without error.
	_, width := utf8.DecodeRune(data)
	if width > 1 {
		// It's a valid encoding. Width cannot be one for a correctly encoded
		// non-ASCII rune.
		return width, data[0:width], nil
	}

	// We know it's an error: we have width==1 and implicitly r==utf8.RuneError.
	// Is the error because there wasn't a full rune to be decoded?
	// FullRune distinguishes correctly between erroneous and incomplete encodings.
	if !atEOF && !utf8.FullRune(data) {
		// Incomplete; get more bytes.
		return 0, nil, nil
	}

	// We have a real UTF-8 encoding error. Return a properly encoded error rune
	// but advance only one byte. This matches the behavior of a range loop over
	// an incorrectly encoded string.
	return 1, errorRune, nil
}
| |
// dropCR returns data without its last byte when that byte is a carriage
// return, and returns data unchanged otherwise. The result shares data's
// backing array.
func dropCR(data []byte) []byte {
	n := len(data)
	if n == 0 || data[n-1] != '\r' {
		return data
	}
	return data[:n-1]
}

// ScanLines is a split function for a Scanner that returns each line of
// text, stripped of any trailing end-of-line marker. The returned line may
// be empty. The end-of-line marker is one optional carriage return followed
// by one mandatory newline; in regular expression notation, `\r?\n`.
// The last non-empty line of input is returned even if it has no newline.
func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	switch nl := bytes.IndexByte(data, '\n'); {
	case atEOF && len(data) == 0:
		// Nothing left to scan.
		return 0, nil, nil
	case nl >= 0:
		// A full newline-terminated line: consume it and its newline.
		return nl + 1, dropCR(data[:nl]), nil
	case atEOF:
		// A final line with no terminator: return all of it.
		return len(data), dropCR(data), nil
	default:
		// No newline yet: request more data.
		return 0, nil, nil
	}
}
| |
// isSpace reports whether the character is a Unicode white space character.
// It is written out by hand to avoid a dependency on the unicode package;
// the tests check that the implementation is valid.
func isSpace(r rune) bool {
	switch r {
	case '\t', '\n', '\v', '\f', '\r', ' ':
		// ASCII: \t through \r plus space.
		return true
	case '\u0085', '\u00A0':
		// The two Latin-1 oddballs.
		return true
	case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
		// Isolated high-valued ones.
		return true
	}
	// The contiguous high-valued run.
	return '\u2000' <= r && r <= '\u200a'
}

// ScanWords is a split function for a Scanner that returns each
// space-separated word of text, with surrounding spaces deleted. It will
// never return an empty string. The definition of space is set by
// unicode.IsSpace.
func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces to find where the word begins.
	begin := 0
	for begin < len(data) {
		r, size := utf8.DecodeRune(data[begin:])
		if !isSpace(r) {
			break
		}
		begin += size
	}
	// Walk forward to the first space after the word; it marks the end
	// of the word and is consumed along with it.
	for pos := begin; pos < len(data); {
		r, size := utf8.DecodeRune(data[pos:])
		if isSpace(r) {
			return pos + size, data[begin:pos], nil
		}
		pos += size
	}
	// At EOF, whatever follows the leading spaces is a final, non-empty,
	// non-terminated word. Return it.
	if atEOF && begin < len(data) {
		return len(data), data[begin:], nil
	}
	// Request more data, discarding the leading spaces already seen.
	return begin, nil, nil
}