utf8string/string.go - exp - Git at Google

 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Package utf8string provides an efficient way to index strings by rune rather than by byte.
 package utf8string // import "golang.org/x/exp/utf8string"

 import (
 	"errors"
 	"unicode/utf8"
 )

 // String wraps a regular string with a small structure that provides more
 // efficient indexing by code point index, as opposed to byte index.
 // Scanning incrementally forwards or backwards is O(1) per index operation
 // (although not as fast a range clause going forwards).  Random access is
 // O(N) in the length of the string, but the overhead is less than always
 // scanning from the beginning.
 // If the string is ASCII, random access is O(1).
 // Unlike the built-in string type, String has internal mutable state and
 // is not thread-safe.
 type String struct {
 	str      string
 	numRunes int
 	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
 	width    int
 	bytePos  int
 	runePos  int
 	nonASCII int // byte index of the first non-ASCII rune.
 }

 // NewString returns a new UTF-8 string with the provided contents.
 func NewString(contents string) *String {
 	return new(String).Init(contents)
 }

 // Init initializes an existing String to hold the provided contents.
 // It returns a pointer to the initialized String.
 func (s *String) Init(contents string) *String {
 	s.str = contents
 	s.bytePos = 0
 	s.runePos = 0
 	for i := 0; i < len(contents); i++ {
 		if contents[i] >= utf8.RuneSelf {
 			// Not ASCII.
 			s.numRunes = utf8.RuneCountInString(contents)
 			_, s.width = utf8.DecodeRuneInString(contents)
 			s.nonASCII = i
 			return s
 		}
 	}
 	// ASCII is simple.  Also, the empty string is ASCII.
 	s.numRunes = len(contents)
 	s.width = 0
 	s.nonASCII = len(contents)
 	return s
 }

 // String returns the contents of the String.  This method also means the
 // String is directly printable by fmt.Print.
 func (s *String) String() string {
 	return s.str
 }

 // RuneCount returns the number of runes (Unicode code points) in the String.
 func (s *String) RuneCount() int {
 	return s.numRunes
 }

 // IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
 func (s *String) IsASCII() bool {
 	return s.width == 0
 }

 // Slice returns the string sliced at rune positions [i:j].
 func (s *String) Slice(i, j int) string {
 	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
 	if j < s.nonASCII {
 		return s.str[i:j]
 	}
 	if i < 0 || j > s.numRunes || i > j {
 		panic(sliceOutOfRange)
 	}
 	if i == j {
 		return ""
 	}
 	// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
 	var low, high int
 	switch {
 	case i < s.nonASCII:
 		low = i
 	case i == s.numRunes:
 		low = len(s.str)
 	default:
 		s.At(i)
 		low = s.bytePos
 	}
 	switch {
 	case j == s.numRunes:
 		high = len(s.str)
 	default:
 		s.At(j)
 		high = s.bytePos
 	}
 	return s.str[low:high]
 }

 // At returns the rune with index i in the String.  The sequence of runes is the same
 // as iterating over the contents with a "for range" clause.
 func (s *String) At(i int) rune {
 	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
 	if i < s.nonASCII {
 		return rune(s.str[i])
 	}

 	// Now we do need to know the index is valid.
 	if i < 0 || i >= s.numRunes {
 		panic(outOfRange)
 	}

 	var r rune

 	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
 	// With these cases, all scans from beginning or end work in O(1) time per rune.
 	switch {

 	case i == s.runePos-1: // backing up one rune
 		r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
 		s.runePos = i
 		s.bytePos -= s.width
 		return r
 	case i == s.runePos+1: // moving ahead one rune
 		s.runePos = i
 		s.bytePos += s.width
 		fallthrough
 	case i == s.runePos:
 		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
 		return r
 	case i == 0: // start of string
 		r, s.width = utf8.DecodeRuneInString(s.str)
 		s.runePos = 0
 		s.bytePos = 0
 		return r

 	case i == s.numRunes-1: // last rune in string
 		r, s.width = utf8.DecodeLastRuneInString(s.str)
 		s.runePos = i
 		s.bytePos = len(s.str) - s.width
 		return r
 	}

 	// We need to do a linear scan.  There are three places to start from:
 	// 1) The beginning
 	// 2) bytePos/runePos.
 	// 3) The end
 	// Choose the closest in rune count, scanning backwards if necessary.
 	forward := true
 	if i < s.runePos {
 		// Between beginning and pos.  Which is closer?
 		// Since both i and runePos are guaranteed >= nonASCII, that's the
 		// lowest location we need to start from.
 		if i < (s.runePos-s.nonASCII)/2 {
 			// Scan forward from beginning
 			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
 		} else {
 			// Scan backwards from where we are
 			forward = false
 		}
 	} else {
 		// Between pos and end.  Which is closer?
 		if i-s.runePos < (s.numRunes-s.runePos)/2 {
 			// Scan forward from pos
 		} else {
 			// Scan backwards from end
 			s.bytePos, s.runePos = len(s.str), s.numRunes
 			forward = false
 		}
 	}
 	if forward {
 		// TODO: Is it much faster to use a range loop for this scan?
 		for {
 			r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
 			if s.runePos == i {
 				break
 			}
 			s.runePos++
 			s.bytePos += s.width
 		}
 	} else {
 		for {
 			r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
 			s.runePos--
 			s.bytePos -= s.width
 			if s.runePos == i {
 				break
 			}
 		}
 	}
 	return r
 }

 var outOfRange = errors.New("utf8string: index out of range")
 var sliceOutOfRange = errors.New("utf8string: slice index out of range")
	// Copyright 2009 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Package utf8string provides an efficient way to index strings by rune rather than by byte.
	package utf8string // import "golang.org/x/exp/utf8string"

	import (
	"errors"
	"unicode/utf8"
	)

	// String wraps a regular string with a small structure that provides more
	// efficient indexing by code point index, as opposed to byte index.
	// Scanning incrementally forwards or backwards is O(1) per index operation
	// (although not as fast a range clause going forwards). Random access is
	// O(N) in the length of the string, but the overhead is less than always
	// scanning from the beginning.
	// If the string is ASCII, random access is O(1).
	// Unlike the built-in string type, String has internal mutable state and
	// is not thread-safe.
	type String struct {
	str string
	numRunes int
	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
	width int
	bytePos int
	runePos int
	nonASCII int // byte index of the first non-ASCII rune.
	}

	// NewString returns a new UTF-8 string with the provided contents.
	func NewString(contents string) *String {
	return new(String).Init(contents)
	}

	// Init initializes an existing String to hold the provided contents.
	// It returns a pointer to the initialized String.
	func (s String) Init(contents string) String {
	s.str = contents
	s.bytePos = 0
	s.runePos = 0
	for i := 0; i < len(contents); i++ {
	if contents[i] >= utf8.RuneSelf {
	// Not ASCII.
	s.numRunes = utf8.RuneCountInString(contents)
	_, s.width = utf8.DecodeRuneInString(contents)
	s.nonASCII = i
	return s
	}
	}
	// ASCII is simple. Also, the empty string is ASCII.
	s.numRunes = len(contents)
	s.width = 0
	s.nonASCII = len(contents)
	return s
	}

	// String returns the contents of the String. This method also means the
	// String is directly printable by fmt.Print.
	func (s *String) String() string {
	return s.str
	}

	// RuneCount returns the number of runes (Unicode code points) in the String.
	func (s *String) RuneCount() int {
	return s.numRunes
	}

	// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
	func (s *String) IsASCII() bool {
	return s.width == 0
	}

	// Slice returns the string sliced at rune positions [i:j].
	func (s *String) Slice(i, j int) string {
	// ASCII is easy. Let the compiler catch the indexing error if there is one.
	if j < s.nonASCII {
	return s.str[i:j]
	}
	if i < 0 \|\| j > s.numRunes \|\| i > j {
	panic(sliceOutOfRange)
	}
	if i == j {
	return ""
	}
	// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
	var low, high int
	switch {
	case i < s.nonASCII:
	low = i
	case i == s.numRunes:
	low = len(s.str)
	default:
	s.At(i)
	low = s.bytePos
	}
	switch {
	case j == s.numRunes:
	high = len(s.str)
	default:
	s.At(j)
	high = s.bytePos
	}
	return s.str[low:high]
	}

	// At returns the rune with index i in the String. The sequence of runes is the same
	// as iterating over the contents with a "for range" clause.
	func (s *String) At(i int) rune {
	// ASCII is easy. Let the compiler catch the indexing error if there is one.
	if i < s.nonASCII {
	return rune(s.str[i])
	}

	// Now we do need to know the index is valid.
	if i < 0 \|\| i >= s.numRunes {
	panic(outOfRange)
	}

	var r rune

	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
	// With these cases, all scans from beginning or end work in O(1) time per rune.
	switch {

	case i == s.runePos-1: // backing up one rune
	r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
	s.runePos = i
	s.bytePos -= s.width
	return r
	case i == s.runePos+1: // moving ahead one rune
	s.runePos = i
	s.bytePos += s.width
	fallthrough
	case i == s.runePos:
	r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
	return r
	case i == 0: // start of string
	r, s.width = utf8.DecodeRuneInString(s.str)
	s.runePos = 0
	s.bytePos = 0
	return r

	case i == s.numRunes-1: // last rune in string
	r, s.width = utf8.DecodeLastRuneInString(s.str)
	s.runePos = i
	s.bytePos = len(s.str) - s.width
	return r
	}

	// We need to do a linear scan. There are three places to start from:
	// 1) The beginning
	// 2) bytePos/runePos.
	// 3) The end
	// Choose the closest in rune count, scanning backwards if necessary.
	forward := true
	if i < s.runePos {
	// Between beginning and pos. Which is closer?
	// Since both i and runePos are guaranteed >= nonASCII, that's the
	// lowest location we need to start from.
	if i < (s.runePos-s.nonASCII)/2 {
	// Scan forward from beginning
	s.bytePos, s.runePos = s.nonASCII, s.nonASCII
	} else {
	// Scan backwards from where we are
	forward = false
	}
	} else {
	// Between pos and end. Which is closer?
	if i-s.runePos < (s.numRunes-s.runePos)/2 {
	// Scan forward from pos
	} else {
	// Scan backwards from end
	s.bytePos, s.runePos = len(s.str), s.numRunes
	forward = false
	}
	}
	if forward {
	// TODO: Is it much faster to use a range loop for this scan?
	for {
	r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
	if s.runePos == i {
	break
	}
	s.runePos++
	s.bytePos += s.width
	}
	} else {
	for {
	r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
	s.runePos--
	s.bytePos -= s.width
	if s.runePos == i {
	break
	}
	}
	}
	return r
	}

	var outOfRange = errors.New("utf8string: index out of range")
	var sliceOutOfRange = errors.New("utf8string: slice index out of range")