// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package bufio

import (
	"bytes"
	"errors"
	"io"
	"unicode/utf8"
)

// Scanner provides a convenient interface for reading data such as
// a file of newline-delimited lines of text. Successive calls to
// the Scan method will step through the 'tokens' of a file, skipping
// the bytes between the tokens. The specification of a token is
// defined by a split function of type SplitFunc; the default split
// function breaks the input into lines with line termination stripped. Split
// functions are defined in this package for scanning a file into
// lines, bytes, UTF-8-encoded runes, and space-delimited words. The
// client may instead provide a custom split function.
//
// Scanning stops unrecoverably at EOF, the first I/O error, or a token too
// large to fit in the buffer. When a scan stops, the reader may have
// advanced arbitrarily far past the last token. Programs that need more
// control over error handling or large tokens, or must run sequential scans
// on a reader, should use bufio.Reader instead.
//
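// A minimal usage sketch, assuming the calling code imports fmt and os:
//
//	scanner := bufio.NewScanner(os.Stdin)
//	for scanner.Scan() {
//		fmt.Println(scanner.Text()) // the token is a line with its terminator stripped
//	}
//	if err := scanner.Err(); err != nil {
//		fmt.Fprintln(os.Stderr, "reading standard input:", err)
//	}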
type Scanner struct {
	r            io.Reader // The reader provided by the client.
	split        SplitFunc // The function to split the tokens.
	maxTokenSize int       // Maximum size of a token; modified by tests.
	token        []byte    // Last token returned by split.
	buf          []byte    // Buffer used as argument to split.
	start        int       // First non-processed byte in buf.
	end          int       // End of data in buf.
	err          error     // Sticky error.
	empties      int       // Count of successive empty tokens.
	scanCalled   bool      // Scan has been called; buffer is in use.
	done         bool      // Scan has finished.
}

// SplitFunc is the signature of the split function used to tokenize the
// input. The arguments are an initial substring of the remaining unprocessed
// data and a flag, atEOF, that reports whether the Reader has no more data
// to give. The return values are the number of bytes to advance the input
// and the next token to return to the user, plus an error, if any. If the
// data does not yet hold a complete token, for instance if it has no newline
// while scanning lines, SplitFunc can return (0, nil, nil) to signal the
// Scanner to read more data into the slice and try again with a longer slice
// starting at the same point in the input.
//
// If the returned error is non-nil, scanning stops and the error
// is returned to the client.
//
// The function is never called with an empty data slice unless atEOF
// is true. If atEOF is true, however, data may be non-empty and,
// as always, holds unprocessed text.
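//
// A minimal sketch of a custom split function that breaks the input on
// commas; the name scanCommas is hypothetical and not part of this package:
//
//	func scanCommas(data []byte, atEOF bool) (advance int, token []byte, err error) {
//		if atEOF && len(data) == 0 {
//			return 0, nil, nil
//		}
//		if i := bytes.IndexByte(data, ','); i >= 0 {
//			// A full comma-terminated token: consume the comma, return the token.
//			return i + 1, data[:i], nil
//		}
//		if atEOF {
//			// Final token with no trailing comma.
//			return len(data), data, nil
//		}
//		// Request more data.
//		return 0, nil, nil
//	}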
type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error)

// Errors returned by Scanner.
var (
	ErrTooLong         = errors.New("bufio.Scanner: token too long")
	ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count")
	ErrAdvanceTooFar   = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input")
)

const (
	// MaxScanTokenSize is the maximum size used to buffer a token
	// unless the user provides an explicit buffer with Scanner.Buffer.
	// The actual maximum token size may be smaller as the buffer
	// may need to include, for instance, a newline.
	MaxScanTokenSize = 64 * 1024

	startBufSize = 4096 // Size of initial allocation for buffer.
)

// NewScanner returns a new Scanner to read from r.
// The split function defaults to ScanLines.
func NewScanner(r io.Reader) *Scanner {
	return &Scanner{
		r:            r,
		split:        ScanLines,
		maxTokenSize: MaxScanTokenSize,
	}
}

// Err returns the first non-EOF error that was encountered by the Scanner.
func (s *Scanner) Err() error {
	if s.err == io.EOF {
		return nil
	}
	return s.err
}

// Bytes returns the most recent token generated by a call to Scan.
// The underlying array may point to data that will be overwritten
// by a subsequent call to Scan. It does no allocation.
func (s *Scanner) Bytes() []byte {
	return s.token
}

// Text returns the most recent token generated by a call to Scan
// as a newly allocated string holding its bytes.
func (s *Scanner) Text() string {
	return string(s.token)
}

// ErrFinalToken is a special sentinel error value. It is intended to be
// returned by a Split function to indicate that the token being delivered
// with the error is the last token and scanning should stop after this one.
// After ErrFinalToken is received by Scan, scanning stops with no error.
// The value is useful to stop processing early or when it is necessary to
// deliver a final empty token. One could achieve the same behavior
// with a custom error value but providing one here is tidier.
// See the emptyFinalToken example for a use of this value.
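//
// A minimal sketch of a split function built on ScanWords that treats the
// word "STOP" (a hypothetical marker) as the final token:
//
//	onStop := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
//		advance, token, err = bufio.ScanWords(data, atEOF)
//		if err == nil && string(token) == "STOP" {
//			// Deliver "STOP" as the last token and end the scan cleanly.
//			err = bufio.ErrFinalToken
//		}
//		return advance, token, err
//	}
//	scanner.Split(onStop)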
var ErrFinalToken = errors.New("final token")

// Scan advances the Scanner to the next token, which will then be
// available through the Bytes or Text method. It returns false when the
// scan stops, either by reaching the end of the input or an error.
// After Scan returns false, the Err method will return any error that
// occurred during scanning, except that if it was io.EOF, Err
// will return nil.
// Scan panics if the split function returns 100 empty tokens without
// advancing the input. This is a common error mode for scanners.
func (s *Scanner) Scan() bool {
	if s.done {
		return false
	}
	s.scanCalled = true
	// Loop until we have a token.
	for {
		// See if we can get a token with what we already have.
		// If we've run out of data but have an error, give the split function
		// a chance to recover any remaining, possibly empty token.
		if s.end > s.start || s.err != nil {
			advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil)
			if err != nil {
				if err == ErrFinalToken {
					s.token = token
					s.done = true
					return true
				}
				s.setErr(err)
				return false
			}
			if !s.advance(advance) {
				return false
			}
			s.token = token
			if token != nil {
				if s.err == nil || advance > 0 {
					s.empties = 0
				} else {
					// Returning tokens not advancing input at EOF.
					s.empties++
					if s.empties > 100 {
						panic("bufio.Scan: 100 empty tokens without progressing")
					}
				}
				return true
			}
		}
		// We cannot generate a token with what we are holding.
		// If we've already hit EOF or an I/O error, we are done.
		if s.err != nil {
			// Shut it down.
			s.start = 0
			s.end = 0
			return false
		}
		// Must read more data.
		// First, shift data to beginning of buffer if there's lots of empty space
		// or space is needed.
		if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
			copy(s.buf, s.buf[s.start:s.end])
			s.end -= s.start
			s.start = 0
		}
		// Is the buffer full? If so, resize.
		if s.end == len(s.buf) {
			// Guarantee no overflow in the multiplication below.
			const maxInt = int(^uint(0) >> 1)
			if len(s.buf) >= s.maxTokenSize || len(s.buf) > maxInt/2 {
				s.setErr(ErrTooLong)
				return false
			}
			newSize := len(s.buf) * 2
			if newSize == 0 {
				newSize = startBufSize
			}
			if newSize > s.maxTokenSize {
				newSize = s.maxTokenSize
			}
			newBuf := make([]byte, newSize)
			copy(newBuf, s.buf[s.start:s.end])
			s.buf = newBuf
			s.end -= s.start
			s.start = 0
		}
		// Finally we can read some input. Make sure we don't get stuck with
		// a misbehaving Reader. Officially we don't need to do this, but let's
		// be extra careful: Scanner is for safe, simple jobs.
		for loop := 0; ; {
			n, err := s.r.Read(s.buf[s.end:len(s.buf)])
			s.end += n
			if err != nil {
				s.setErr(err)
				break
			}
			if n > 0 {
				s.empties = 0
				break
			}
			loop++
			if loop > maxConsecutiveEmptyReads {
				s.setErr(io.ErrNoProgress)
				break
			}
		}
	}
}

// advance consumes n bytes of the buffer. It reports whether the advance was legal.
func (s *Scanner) advance(n int) bool {
	if n < 0 {
		s.setErr(ErrNegativeAdvance)
		return false
	}
	if n > s.end-s.start {
		s.setErr(ErrAdvanceTooFar)
		return false
	}
	s.start += n
	return true
}

// setErr records the first error encountered.
func (s *Scanner) setErr(err error) {
	if s.err == nil || s.err == io.EOF {
		s.err = err
	}
}

// Buffer sets the initial buffer to use when scanning and the maximum
// size of buffer that may be allocated during scanning. The maximum
// token size is the larger of max and cap(buf). If max <= cap(buf),
// Scan will use this buffer only and do no allocation.
//
// By default, Scan uses an internal buffer and sets the
// maximum token size to MaxScanTokenSize.
//
// Buffer panics if it is called after scanning has started.
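//
// A minimal sketch of raising the limit for unusually long tokens; the
// 1 MiB cap below is an arbitrary illustrative value:
//
//	scanner := bufio.NewScanner(r)
//	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)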
func (s *Scanner) Buffer(buf []byte, max int) {
	if s.scanCalled {
		panic("Buffer called after Scan")
	}
	s.buf = buf[0:cap(buf)]
	s.maxTokenSize = max
}

// Split sets the split function for the Scanner.
// The default split function is ScanLines.
//
// Split panics if it is called after scanning has started.
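//
// A minimal sketch of switching to rune-at-a-time scanning:
//
//	scanner := bufio.NewScanner(r)
//	scanner.Split(bufio.ScanRunes)
//	for scanner.Scan() {
//		// scanner.Text() holds a single UTF-8-encoded rune.
//	}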
func (s *Scanner) Split(split SplitFunc) {
	if s.scanCalled {
		panic("Split called after Scan")
	}
	s.split = split
}

// Split functions

// ScanBytes is a split function for a Scanner that returns each byte as a token.
func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	return 1, data[0:1], nil
}

var errorRune = []byte(string(utf8.RuneError))

// ScanRunes is a split function for a Scanner that returns each
// UTF-8-encoded rune as a token. The sequence of runes returned is
// equivalent to that from a range loop over the input as a string, which
// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
// Because of the Scan interface, this makes it impossible for the client to
// distinguish correctly encoded replacement runes from encoding errors.
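//
// A minimal sketch, assuming the calling code imports strings; the invalid
// byte 0xFF is delivered as the replacement rune U+FFFD, not as an error:
//
//	scanner := bufio.NewScanner(strings.NewReader("a\xffb"))
//	scanner.Split(bufio.ScanRunes)
//	for scanner.Scan() {
//		// Tokens, in order: "a", "\ufffd", "b".
//	}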
func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	// Fast path 1: ASCII.
	if data[0] < utf8.RuneSelf {
		return 1, data[0:1], nil
	}

	// Fast path 2: Correct UTF-8 decode without error.
	_, width := utf8.DecodeRune(data)
	if width > 1 {
		// It's a valid encoding. Width cannot be one for a correctly encoded
		// non-ASCII rune.
		return width, data[0:width], nil
	}

	// We know it's an error: we have width==1 and implicitly r==utf8.RuneError.
	// Is the error because there wasn't a full rune to be decoded?
	// FullRune distinguishes correctly between erroneous and incomplete encodings.
	if !atEOF && !utf8.FullRune(data) {
		// Incomplete; get more bytes.
		return 0, nil, nil
	}

	// We have a real UTF-8 encoding error. Return a properly encoded error rune
	// but advance only one byte. This matches the behavior of a range loop over
	// an incorrectly encoded string.
	return 1, errorRune, nil
}

// dropCR drops a terminal \r from the data.
func dropCR(data []byte) []byte {
	if len(data) > 0 && data[len(data)-1] == '\r' {
		return data[0 : len(data)-1]
	}
	return data
}

// ScanLines is a split function for a Scanner that returns each line of
// text, stripped of any trailing end-of-line marker. The returned line may
// be empty. The end-of-line marker is one optional carriage return followed
// by one mandatory newline. In regular expression notation, it is `\r?\n`.
// The last non-empty line of input will be returned even if it has no
// newline.
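//
// A minimal sketch of the carriage-return handling, assuming the calling
// code imports strings; the input value is purely illustrative:
//
//	scanner := bufio.NewScanner(strings.NewReader("a\r\nb\nc"))
//	for scanner.Scan() {
//		// Tokens, in order: "a", "b", "c".
//	}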
func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	if i := bytes.IndexByte(data, '\n'); i >= 0 {
		// We have a full newline-terminated line.
		return i + 1, dropCR(data[0:i]), nil
	}
	// If we're at EOF, we have a final, non-terminated line. Return it.
	if atEOF {
		return len(data), dropCR(data), nil
	}
	// Request more data.
	return 0, nil, nil
}

// isSpace reports whether the character is a Unicode white space character.
// We avoid dependency on the unicode package, but check validity of the implementation
// in the tests.
func isSpace(r rune) bool {
	if r <= '\u00FF' {
		// Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs.
		switch r {
		case ' ', '\t', '\n', '\v', '\f', '\r':
			return true
		case '\u0085', '\u00A0':
			return true
		}
		return false
	}
	// High-valued ones.
	if '\u2000' <= r && r <= '\u200a' {
		return true
	}
	switch r {
	case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
		return true
	}
	return false
}

// ScanWords is a split function for a Scanner that returns each
// space-separated word of text, with surrounding spaces deleted. It will
// never return an empty string. The definition of space is set by
// unicode.IsSpace.
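//
// A minimal word-count sketch, assuming the calling code imports strings;
// the input value is purely illustrative:
//
//	scanner := bufio.NewScanner(strings.NewReader("one two three"))
//	scanner.Split(bufio.ScanWords)
//	count := 0
//	for scanner.Scan() {
//		count++
//	}
//	// count is now 3.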
func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !isSpace(r) {
			break
		}
	}
	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if isSpace(r) {
			return i + width, data[start:i], nil
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Request more data.
	return start, nil, nil
}