Source File
scan.go
Belonging Package
bufio
// Copyright 2013 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package bufioimport ()// Scanner provides a convenient interface for reading data such as// a file of newline-delimited lines of text. Successive calls to// the Scan method will step through the 'tokens' of a file, skipping// the bytes between the tokens. The specification of a token is// defined by a split function of type SplitFunc; the default split// function breaks the input into lines with line termination stripped. Split// functions are defined in this package for scanning a file into// lines, bytes, UTF-8-encoded runes, and space-delimited words. The// client may instead provide a custom split function.//// Scanning stops unrecoverably at EOF, the first I/O error, or a token too// large to fit in the buffer. When a scan stops, the reader may have// advanced arbitrarily far past the last token. Programs that need more// control over error handling or large tokens, or must run sequential scans// on a reader, should use bufio.Reader instead.type Scanner struct {r io.Reader // The reader provided by the client.split SplitFunc // The function to split the tokens.maxTokenSize int // Maximum size of a token; modified by tests.token []byte // Last token returned by split.buf []byte // Buffer used as argument to split.start int // First non-processed byte in buf.end int // End of data in buf.err error // Sticky error.empties int // Count of successive empty tokens.scanCalled bool // Scan has been called; buffer is in use.done bool // Scan has finished.}// SplitFunc is the signature of the split function used to tokenize the// input. The arguments are an initial substring of the remaining unprocessed// data and a flag, atEOF, that reports whether the Reader has no more data// to give. 
The return values are the number of bytes to advance the input// and the next token to return to the user, if any, plus an error, if any.//// Scanning stops if the function returns an error, in which case some of// the input may be discarded. If that error is ErrFinalToken, scanning// stops with no error.//// Otherwise, the Scanner advances the input. If the token is not nil,// the Scanner returns it to the user. If the token is nil, the// Scanner reads more data and continues scanning; if there is no more// data--if atEOF was true--the Scanner returns. If the data does not// yet hold a complete token, for instance if it has no newline while// scanning lines, a SplitFunc can return (0, nil, nil) to signal the// Scanner to read more data into the slice and try again with a// longer slice starting at the same point in the input.//// The function is never called with an empty data slice unless atEOF// is true. If atEOF is true, however, data may be non-empty and,// as always, holds unprocessed text.type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error)// Errors returned by Scanner.var (ErrTooLong = errors.New("bufio.Scanner: token too long")ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count")ErrAdvanceTooFar = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input")ErrBadReadCount = errors.New("bufio.Scanner: Read returned impossible count"))const (// MaxScanTokenSize is the maximum size used to buffer a token// unless the user provides an explicit buffer with Scanner.Buffer.// The actual maximum token size may be smaller as the buffer// may need to include, for instance, a newline.MaxScanTokenSize = 64 * 1024startBufSize = 4096 // Size of initial allocation for buffer.)// NewScanner returns a new Scanner to read from r.// The split function defaults to ScanLines.func ( io.Reader) *Scanner {return &Scanner{r: ,split: ScanLines,maxTokenSize: MaxScanTokenSize,}}// Err returns the first 
non-EOF error that was encountered by the Scanner.func ( *Scanner) () error {if .err == io.EOF {return nil}return .err}// Bytes returns the most recent token generated by a call to Scan.// The underlying array may point to data that will be overwritten// by a subsequent call to Scan. It does no allocation.func ( *Scanner) () []byte {return .token}// Text returns the most recent token generated by a call to Scan// as a newly allocated string holding its bytes.func ( *Scanner) () string {return string(.token)}// ErrFinalToken is a special sentinel error value. It is intended to be// returned by a Split function to indicate that the token being delivered// with the error is the last token and scanning should stop after this one.// After ErrFinalToken is received by Scan, scanning stops with no error.// The value is useful to stop processing early or when it is necessary to// deliver a final empty token. One could achieve the same behavior// with a custom error value but providing one here is tidier.// See the emptyFinalToken example for a use of this value.var ErrFinalToken = errors.New("final token")// Scan advances the Scanner to the next token, which will then be// available through the Bytes or Text method. It returns false when the// scan stops, either by reaching the end of the input or an error.// After Scan returns false, the Err method will return any error that// occurred during scanning, except that if it was io.EOF, Err// will return nil.// Scan panics if the split function returns too many empty// tokens without advancing the input. 
This is a common error mode for// scanners.func ( *Scanner) () bool {if .done {return false}.scanCalled = true// Loop until we have a token.for {// See if we can get a token with what we already have.// If we've run out of data but have an error, give the split function// a chance to recover any remaining, possibly empty token.if .end > .start || .err != nil {, , := .split(.buf[.start:.end], .err != nil)if != nil {if == ErrFinalToken {.token =.done = truereturn true}.setErr()return false}if !.advance() {return false}.token =if != nil {if .err == nil || > 0 {.empties = 0} else {// Returning tokens not advancing input at EOF..empties++if .empties > maxConsecutiveEmptyReads {panic("bufio.Scan: too many empty tokens without progressing")}}return true}}// We cannot generate a token with what we are holding.// If we've already hit EOF or an I/O error, we are done.if .err != nil {// Shut it down..start = 0.end = 0return false}// Must read more data.// First, shift data to beginning of buffer if there's lots of empty space// or space is needed.if .start > 0 && (.end == len(.buf) || .start > len(.buf)/2) {copy(.buf, .buf[.start:.end]).end -= .start.start = 0}// Is the buffer full? If so, resize.if .end == len(.buf) {// Guarantee no overflow in the multiplication below.const = int(^uint(0) >> 1)if len(.buf) >= .maxTokenSize || len(.buf) > /2 {.setErr(ErrTooLong)return false}:= len(.buf) * 2if == 0 {= startBufSize}if > .maxTokenSize {= .maxTokenSize}:= make([]byte, )copy(, .buf[.start:.end]).buf =.end -= .start.start = 0}// Finally we can read some input. Make sure we don't get stuck with// a misbehaving Reader. 
Officially we don't need to do this, but let's// be extra careful: Scanner is for safe, simple jobs.for := 0; ; {, := .r.Read(.buf[.end:len(.buf)])if < 0 || len(.buf)-.end < {.setErr(ErrBadReadCount)break}.end +=if != nil {.setErr()break}if > 0 {.empties = 0break}++if > maxConsecutiveEmptyReads {.setErr(io.ErrNoProgress)break}}}}// advance consumes n bytes of the buffer. It reports whether the advance was legal.func ( *Scanner) ( int) bool {if < 0 {.setErr(ErrNegativeAdvance)return false}if > .end-.start {.setErr(ErrAdvanceTooFar)return false}.start +=return true}// setErr records the first error encountered.func ( *Scanner) ( error) {if .err == nil || .err == io.EOF {.err =}}// Buffer sets the initial buffer to use when scanning and the maximum// size of buffer that may be allocated during scanning. The maximum// token size is the larger of max and cap(buf). If max <= cap(buf),// Scan will use this buffer only and do no allocation.//// By default, Scan uses an internal buffer and sets the// maximum token size to MaxScanTokenSize.//// Buffer panics if it is called after scanning has started.func ( *Scanner) ( []byte, int) {if .scanCalled {panic("Buffer called after Scan")}.buf = [0:cap()].maxTokenSize =}// Split sets the split function for the Scanner.// The default split function is ScanLines.//// Split panics if it is called after scanning has started.func ( *Scanner) ( SplitFunc) {if .scanCalled {panic("Split called after Scan")}.split =}// Split functions// ScanBytes is a split function for a Scanner that returns each byte as a token.func ( []byte, bool) ( int, []byte, error) {if && len() == 0 {return 0, nil, nil}return 1, [0:1], nil}var errorRune = []byte(string(utf8.RuneError))// ScanRunes is a split function for a Scanner that returns each// UTF-8-encoded rune as a token. 
The sequence of runes returned is// equivalent to that from a range loop over the input as a string, which// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".// Because of the Scan interface, this makes it impossible for the client to// distinguish correctly encoded replacement runes from encoding errors.func ( []byte, bool) ( int, []byte, error) {if && len() == 0 {return 0, nil, nil}// Fast path 1: ASCII.if [0] < utf8.RuneSelf {return 1, [0:1], nil}// Fast path 2: Correct UTF-8 decode without error., := utf8.DecodeRune()if > 1 {// It's a valid encoding. Width cannot be one for a correctly encoded// non-ASCII rune.return , [0:], nil}// We know it's an error: we have width==1 and implicitly r==utf8.RuneError.// Is the error because there wasn't a full rune to be decoded?// FullRune distinguishes correctly between erroneous and incomplete encodings.if ! && !utf8.FullRune() {// Incomplete; get more bytes.return 0, nil, nil}// We have a real UTF-8 encoding error. Return a properly encoded error rune// but advance only one byte. This matches the behavior of a range loop over// an incorrectly encoded string.return 1, errorRune, nil}// dropCR drops a terminal \r from the data.func ( []byte) []byte {if len() > 0 && [len()-1] == '\r' {return [0 : len()-1]}return}// ScanLines is a split function for a Scanner that returns each line of// text, stripped of any trailing end-of-line marker. The returned line may// be empty. The end-of-line marker is one optional carriage return followed// by one mandatory newline. In regular expression notation, it is `\r?\n`.// The last non-empty line of input will be returned even if it has no// newline.func ( []byte, bool) ( int, []byte, error) {if && len() == 0 {return 0, nil, nil}if := bytes.IndexByte(, '\n'); >= 0 {// We have a full newline-terminated line.return + 1, dropCR([0:]), nil}// If we're at EOF, we have a final, non-terminated line. 
Return it.if {return len(), dropCR(), nil}// Request more data.return 0, nil, nil}// isSpace reports whether the character is a Unicode white space character.// We avoid dependency on the unicode package, but check validity of the implementation// in the tests.func ( rune) bool {if <= '\u00FF' {// Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs.switch {case ' ', '\t', '\n', '\v', '\f', '\r':return truecase '\u0085', '\u00A0':return true}return false}// High-valued ones.if '\u2000' <= && <= '\u200a' {return true}switch {case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':return true}return false}// ScanWords is a split function for a Scanner that returns each// space-separated word of text, with surrounding spaces deleted. It will// never return an empty string. The definition of space is set by// unicode.IsSpace.func ( []byte, bool) ( int, []byte, error) {// Skip leading spaces.:= 0for := 0; < len(); += {var rune, = utf8.DecodeRune([:])if !isSpace() {break}}// Scan until space, marking end of word.for , := 0, ; < len(); += {var rune, = utf8.DecodeRune([:])if isSpace() {return + , [:], nil}}// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.if && len() > {return len(), [:], nil}// Request more data.return , nil, nil}
![]() |
The pages are generated with Golds v0.6.7 (GOOS=linux GOARCH=amd64). Golds is a Go 101 project developed by Tapir Liu. PRs and bug reports are welcome and can be submitted to the issue list. Please follow @Go100and1 (reachable from the left QR code) to get the latest news about Golds.