encoding/encoding.go - text - Git at Google

 // Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Package encoding defines an interface for character encodings, such as Shift
 // JIS and Windows 1252, that can convert to and from UTF-8.
 //
 // To convert the bytes of an io.Reader r from the encoding e to UTF-8:
 //	rInUTF8 := transform.NewReader(r, e.NewDecoder())
 // and to convert from UTF-8 to the encoding e:
 //	wInUTF8 := transform.NewWriter(w, e.NewEncoder())
 // In both cases, import "golang.org/x/text/transform".
 //
 // Encoding implementations are provided in other packages, such as
 // golang.org/x/text/encoding/charmap and
 // golang.org/x/text/encoding/japanese.
 package encoding // import "golang.org/x/text/encoding"

 import (
 	"errors"
 	"unicode/utf8"

 	"golang.org/x/text/transform"
 )

 // Encoding is a character set encoding that can be transformed to and from
 // UTF-8. Each method returns a transform.Transformer for one direction of
 // the conversion.
 type Encoding interface {
 	// NewDecoder returns a transformer that converts to UTF-8.
 	//
 	// Transforming source bytes that are not of that encoding will not
 	// result in an error per se. Each byte that cannot be transcoded will
 	// be represented in the output by the UTF-8 encoding of '\uFFFD', the
 	// replacement rune.
 	NewDecoder() transform.Transformer

 	// NewEncoder returns a transformer that converts from UTF-8.
 	//
 	// Transforming source bytes that are not valid UTF-8 will not result in
 	// an error per se. Each rune that cannot be transcoded will be
 	// represented in the output by an encoding-specific replacement such as
 	// "\x1a" (the ASCII substitute character) or "\xff\xfd". To return
 	// early with an error instead, use transform.Chain to preprocess the
 	// data with a UTF8Validator.
 	NewEncoder() transform.Transformer
 }

 // ASCIISub is the ASCII substitute character, as recommended by
 // http://unicode.org/reports/tr36/#Text_Comparison
 //
 // It is declared as an untyped rune constant with the value 0x1A.
 const ASCIISub = '\x1a'

 // Nop is the nop encoding. Its transformed bytes are the same as the source
 // bytes; it does not replace invalid UTF-8 sequences.
 var Nop Encoding = nop{}

 // nop is the field-less type behind Nop. Both directions of the conversion
 // hand back transform.Nop.
 type nop struct{}

 // NewDecoder returns transform.Nop.
 func (nop) NewDecoder() transform.Transformer {
 	return transform.Nop
 }

 // NewEncoder returns transform.Nop.
 func (nop) NewEncoder() transform.Transformer {
 	return transform.Nop
 }

 // Replacement is the replacement encoding. Decoding from the replacement
 // encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
 // the replacement encoding yields the same as the source bytes except that
 // invalid UTF-8 is converted to '\uFFFD'.
 //
 // It is defined at https://encoding.spec.whatwg.org/#replacement
 var Replacement Encoding = replacement{}

 // replacement is the field-less type behind Replacement.
 type replacement struct{}

 // NewDecoder returns a replacementDecoder value.
 func (replacement) NewDecoder() transform.Transformer {
 	return replacementDecoder{}
 }

 // NewEncoder returns a replacementEncoder value.
 func (replacement) NewEncoder() transform.Transformer {
 	return replacementEncoder{}
 }

 // replacementDecoder implements the decoding direction of Replacement. It
 // embeds transform.NopResetter and has no fields of its own.
 type replacementDecoder struct{ transform.NopResetter }

 // Transform reports all of src as consumed without inspecting it. Nothing is
 // written until atEOF is true, at which point the UTF-8 encoding of U+FFFD is
 // written to the start of dst.
 //
 // Whenever dst has room for fewer than three bytes it returns
 // transform.ErrShortDst and consumes nothing, regardless of atEOF.
 func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 	// The UTF-8 encoding of U+FFFD is three bytes long.
 	const replacement = "\ufffd"

 	if len(dst) < len(replacement) {
 		return 0, 0, transform.ErrShortDst
 	}
 	if !atEOF {
 		return 0, len(src), nil
 	}
 	// dst is known to be large enough, so copy writes the whole constant.
 	return copy(dst, replacement), len(src), nil
 }

 // replacementEncoder implements the encoding direction of Replacement. It
 // embeds transform.NopResetter and has no fields of its own.
 type replacementEncoder struct{ transform.NopResetter }

 // Transform copies src to dst one rune at a time, writing U+FFFD in place of
 // every byte that does not start a valid UTF-8 sequence.
 //
 // It stops with transform.ErrShortSrc when src ends in what may be a
 // truncated multi-byte sequence and atEOF is false, and with
 // transform.ErrShortDst when the next rune does not fit in the remainder of
 // dst. In both cases the returned counts cover only the runes already
 // processed.
 func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 	for nSrc < len(src) {
 		// Assume a single-byte (ASCII) rune until proven otherwise.
 		r, width := rune(src[nSrc]), 1

 		if r >= utf8.RuneSelf {
 			rest := src[nSrc:]
 			r, width = utf8.DecodeRune(rest)
 			if width == 1 {
 				// A width of 1 for a non-ASCII lead byte means either
 				// invalid UTF-8 or a character whose tail has not
 				// arrived yet.
 				if !atEOF && !utf8.FullRune(rest) {
 					return nDst, nSrc, transform.ErrShortSrc
 				}
 				r = '\ufffd'
 			}
 		}

 		if utf8.RuneLen(r) > len(dst)-nDst {
 			return nDst, nSrc, transform.ErrShortDst
 		}
 		nDst += utf8.EncodeRune(dst[nDst:], r)
 		nSrc += width
 	}
 	return nDst, nSrc, nil
 }

 // ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
 // It is the error that UTF8Validator returns for malformed input.
 var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")

 // UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
 // input byte that is not valid UTF-8.
 var UTF8Validator transform.Transformer = utf8Validator{}

 // utf8Validator is the type behind UTF8Validator. It embeds
 // transform.NopResetter and has no fields of its own.
 type utf8Validator struct{ transform.NopResetter }

 // Transform copies src to dst unchanged for as long as src holds valid UTF-8.
 // The two returned counts are always equal.
 //
 // It returns ErrInvalidUTF8 at the first invalid byte, transform.ErrShortSrc
 // when src ends in an incomplete sequence and atEOF is false, and
 // transform.ErrShortDst when dst cannot hold the next rune or the rest of
 // src.
 func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 	// Never look at more leading bytes than dst could receive.
 	limit := len(dst)
 	if len(src) < limit {
 		limit = len(src)
 	}

 	pos := 0
 	for pos < limit {
 		// ASCII fast path.
 		if src[pos] < utf8.RuneSelf {
 			dst[pos] = src[pos]
 			pos++
 			continue
 		}

 		rest := src[pos:]
 		if _, width := utf8.DecodeRune(rest); width > 1 {
 			// A complete multi-byte rune: copy it whole or not at all.
 			if pos+width > len(dst) {
 				return pos, pos, transform.ErrShortDst
 			}
 			pos += copy(dst[pos:], rest[:width])
 			continue
 		}

 		// A width of 1 for a non-ASCII lead byte means either invalid
 		// UTF-8 or a character whose tail has not arrived yet.
 		if atEOF || utf8.FullRune(rest) {
 			return pos, pos, ErrInvalidUTF8
 		}
 		return pos, pos, transform.ErrShortSrc
 	}

 	if len(dst) < len(src) {
 		return limit, limit, transform.ErrShortDst
 	}
 	return limit, limit, nil
 }
	// Copyright 2013 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Package encoding defines an interface for character encodings, such as Shift
	// JIS and Windows 1252, that can convert to and from UTF-8.
	//
	// To convert the bytes of an io.Reader r from the encoding e to UTF-8:
	//	rInUTF8 := transform.NewReader(r, e.NewDecoder())
	// and to convert from UTF-8 to the encoding e:
	//	wInUTF8 := transform.NewWriter(w, e.NewEncoder())
	// In both cases, import "golang.org/x/text/transform".
	//
	// Encoding implementations are provided in other packages, such as
	// golang.org/x/text/encoding/charmap and
	// golang.org/x/text/encoding/japanese.
	package encoding // import "golang.org/x/text/encoding"

	import (
	"errors"
	"unicode/utf8"

	"golang.org/x/text/transform"
	)

	// Encoding is a character set encoding that can be transformed to and from
	// UTF-8. Each method returns a transform.Transformer for one direction of
	// the conversion.
	type Encoding interface {
	// NewDecoder returns a transformer that converts to UTF-8.
	//
	// Transforming source bytes that are not of that encoding will not
	// result in an error per se. Each byte that cannot be transcoded will
	// be represented in the output by the UTF-8 encoding of '\uFFFD', the
	// replacement rune.
	NewDecoder() transform.Transformer

	// NewEncoder returns a transformer that converts from UTF-8.
	//
	// Transforming source bytes that are not valid UTF-8 will not result in
	// an error per se. Each rune that cannot be transcoded will be
	// represented in the output by an encoding-specific replacement such as
	// "\x1a" (the ASCII substitute character) or "\xff\xfd". To return
	// early with an error instead, use transform.Chain to preprocess the
	// data with a UTF8Validator.
	NewEncoder() transform.Transformer
	}

	// ASCIISub is the ASCII substitute character, as recommended by
	// http://unicode.org/reports/tr36/#Text_Comparison
	//
	// It is declared as an untyped rune constant with the value 0x1A.
	const ASCIISub = '\x1a'

	// Nop is the nop encoding. Its transformed bytes are the same as the source
	// bytes; it does not replace invalid UTF-8 sequences.
	var Nop Encoding = nop{}

	// nop is the field-less type behind Nop. Both directions of the conversion
	// hand back transform.Nop.
	type nop struct{}

	// NewDecoder returns transform.Nop.
	func (nop) NewDecoder() transform.Transformer {
	return transform.Nop
	}

	// NewEncoder returns transform.Nop.
	func (nop) NewEncoder() transform.Transformer {
	return transform.Nop
	}

	// Replacement is the replacement encoding. Decoding from the replacement
	// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
	// the replacement encoding yields the same as the source bytes except that
	// invalid UTF-8 is converted to '\uFFFD'.
	//
	// It is defined at https://encoding.spec.whatwg.org/#replacement
	var Replacement Encoding = replacement{}

	// replacement is the field-less type behind Replacement.
	type replacement struct{}

	// NewDecoder returns a replacementDecoder value.
	func (replacement) NewDecoder() transform.Transformer {
	return replacementDecoder{}
	}

	// NewEncoder returns a replacementEncoder value.
	func (replacement) NewEncoder() transform.Transformer {
	return replacementEncoder{}
	}

	// replacementDecoder implements the decoding direction of Replacement. It
	// embeds transform.NopResetter and has no fields of its own.
	type replacementDecoder struct{ transform.NopResetter }

	// Transform reports all of src as consumed without inspecting it. Nothing is
	// written until atEOF is true, at which point the UTF-8 encoding of U+FFFD is
	// written to the start of dst.
	//
	// Whenever dst has room for fewer than three bytes it returns
	// transform.ErrShortDst and consumes nothing, regardless of atEOF.
	func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
		// The UTF-8 encoding of U+FFFD is three bytes long.
		const replacement = "\ufffd"

		if len(dst) < len(replacement) {
			return 0, 0, transform.ErrShortDst
		}
		if !atEOF {
			return 0, len(src), nil
		}
		// dst is known to be large enough, so copy writes the whole constant.
		return copy(dst, replacement), len(src), nil
	}

	// replacementEncoder implements the encoding direction of Replacement. It
	// embeds transform.NopResetter and has no fields of its own.
	type replacementEncoder struct{ transform.NopResetter }

	// Transform copies src to dst one rune at a time, writing U+FFFD in place of
	// every byte that does not start a valid UTF-8 sequence.
	//
	// It stops with transform.ErrShortSrc when src ends in what may be a
	// truncated multi-byte sequence and atEOF is false, and with
	// transform.ErrShortDst when the next rune does not fit in the remainder of
	// dst. In both cases the returned counts cover only the runes already
	// processed.
	func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
		for nSrc < len(src) {
			// Assume a single-byte (ASCII) rune until proven otherwise.
			r, width := rune(src[nSrc]), 1

			if r >= utf8.RuneSelf {
				rest := src[nSrc:]
				r, width = utf8.DecodeRune(rest)
				if width == 1 {
					// A width of 1 for a non-ASCII lead byte means either
					// invalid UTF-8 or a character whose tail has not
					// arrived yet.
					if !atEOF && !utf8.FullRune(rest) {
						return nDst, nSrc, transform.ErrShortSrc
					}
					r = '\ufffd'
				}
			}

			if utf8.RuneLen(r) > len(dst)-nDst {
				return nDst, nSrc, transform.ErrShortDst
			}
			nDst += utf8.EncodeRune(dst[nDst:], r)
			nSrc += width
		}
		return nDst, nSrc, nil
	}

	// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
	// It is the error that UTF8Validator returns for malformed input.
	var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")

	// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
	// input byte that is not valid UTF-8.
	var UTF8Validator transform.Transformer = utf8Validator{}

	// utf8Validator is the type behind UTF8Validator. It embeds
	// transform.NopResetter and has no fields of its own.
	type utf8Validator struct{ transform.NopResetter }

	// Transform copies src to dst unchanged for as long as src holds valid UTF-8.
	// The two returned counts are always equal.
	//
	// It returns ErrInvalidUTF8 at the first invalid byte, transform.ErrShortSrc
	// when src ends in an incomplete sequence and atEOF is false, and
	// transform.ErrShortDst when dst cannot hold the next rune or the rest of
	// src.
	func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
		// Never look at more leading bytes than dst could receive.
		limit := len(dst)
		if len(src) < limit {
			limit = len(src)
		}

		pos := 0
		for pos < limit {
			// ASCII fast path.
			if src[pos] < utf8.RuneSelf {
				dst[pos] = src[pos]
				pos++
				continue
			}

			rest := src[pos:]
			if _, width := utf8.DecodeRune(rest); width > 1 {
				// A complete multi-byte rune: copy it whole or not at all.
				if pos+width > len(dst) {
					return pos, pos, transform.ErrShortDst
				}
				pos += copy(dst[pos:], rest[:width])
				continue
			}

			// A width of 1 for a non-ASCII lead byte means either invalid
			// UTF-8 or a character whose tail has not arrived yet.
			if atEOF || utf8.FullRune(rest) {
				return pos, pos, ErrInvalidUTF8
			}
			return pos, pos, transform.ErrShortSrc
		}

		if len(dst) < len(src) {
			return limit, limit, transform.ErrShortDst
		}
		return limit, limit, nil
	}