libgo/go/unicode/letter.go - gofrontend - Git at Google

 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Package unicode provides data and functions to test some properties of
 // Unicode code points.
 package unicode

 // Tables are regenerated each time we update the Unicode version.
 //go:generate go run maketables.go -tables=all -output tables.go

 const (
 	MaxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
 	ReplacementChar = '\uFFFD'     // Represents invalid code points.
 	MaxASCII        = '\u007F'     // maximum ASCII value.
 	MaxLatin1       = '\u00FF'     // maximum Latin-1 value.
 )

 // RangeTable defines a set of Unicode code points by listing the ranges of
 // code points within the set. The ranges are listed in two slices
 // to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
 // The two slices must be in sorted order and non-overlapping.
 // Also, R32 should contain only values >= 0x10000 (1<<16).
 type RangeTable struct {
 	R16         []Range16
 	R32         []Range32
 	LatinOffset int // number of entries in R16 with Hi <= MaxLatin1
 }

 // Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi
 // inclusive and has the specified stride.
 type Range16 struct {
 	Lo     uint16
 	Hi     uint16
 	Stride uint16
 }

 // Range32 represents of a range of Unicode code points and is used when one or
 // more of the values will not fit in 16 bits. The range runs from Lo to Hi
 // inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
 type Range32 struct {
 	Lo     uint32
 	Hi     uint32
 	Stride uint32
 }

 // CaseRange represents a range of Unicode code points for simple (one
 // code point to one code point) case conversion.
 // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
 // are the number to add to the code point to reach the code point for a
 // different case for that character. They may be negative. If zero, it
 // means the character is in the corresponding case. There is a special
 // case representing sequences of alternating corresponding Upper and Lower
 // pairs. It appears with a fixed Delta of
 //	{UpperLower, UpperLower, UpperLower}
 // The constant UpperLower has an otherwise impossible delta value.
 type CaseRange struct {
 	Lo    uint32
 	Hi    uint32
 	Delta d
 }

 // SpecialCase represents language-specific case mappings such as Turkish.
 // Methods of SpecialCase customize (by overriding) the standard mappings.
 type SpecialCase []CaseRange

 // BUG(r): There is no mechanism for full case folding, that is, for
 // characters that involve multiple runes in the input or output.

 // Indices into the Delta arrays inside CaseRanges for case mapping.
 const (
 	UpperCase = iota
 	LowerCase
 	TitleCase
 	MaxCase
 )

 type d [MaxCase]rune // to make the CaseRanges text shorter

 // If the Delta field of a CaseRange is UpperLower, it means
 // this CaseRange represents a sequence of the form (say)
 // Upper Lower Upper Lower.
 const (
 	UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
 )

 // linearMax is the maximum size table for linear search for non-Latin1 rune.
 // Derived by running 'go test -calibrate'.
 const linearMax = 18

 // is16 reports whether r is in the sorted slice of 16-bit ranges.
 func is16(ranges []Range16, r uint16) bool {
 	if len(ranges) <= linearMax || r <= MaxLatin1 {
 		for i := range ranges {
 			range_ := &ranges[i]
 			if r < range_.Lo {
 				return false
 			}
 			if r <= range_.Hi {
 				return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
 			}
 		}
 		return false
 	}

 	// binary search over ranges
 	lo := 0
 	hi := len(ranges)
 	for lo < hi {
 		m := lo + (hi-lo)/2
 		range_ := &ranges[m]
 		if range_.Lo <= r && r <= range_.Hi {
 			return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
 		}
 		if r < range_.Lo {
 			hi = m
 		} else {
 			lo = m + 1
 		}
 	}
 	return false
 }

 // is32 reports whether r is in the sorted slice of 32-bit ranges.
 func is32(ranges []Range32, r uint32) bool {
 	if len(ranges) <= linearMax {
 		for i := range ranges {
 			range_ := &ranges[i]
 			if r < range_.Lo {
 				return false
 			}
 			if r <= range_.Hi {
 				return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
 			}
 		}
 		return false
 	}

 	// binary search over ranges
 	lo := 0
 	hi := len(ranges)
 	for lo < hi {
 		m := lo + (hi-lo)/2
 		range_ := ranges[m]
 		if range_.Lo <= r && r <= range_.Hi {
 			return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
 		}
 		if r < range_.Lo {
 			hi = m
 		} else {
 			lo = m + 1
 		}
 	}
 	return false
 }

 // Is reports whether the rune is in the specified table of ranges.
 func Is(rangeTab *RangeTable, r rune) bool {
 	r16 := rangeTab.R16
 	if len(r16) > 0 && r <= rune(r16[len(r16)-1].Hi) {
 		return is16(r16, uint16(r))
 	}
 	r32 := rangeTab.R32
 	if len(r32) > 0 && r >= rune(r32[0].Lo) {
 		return is32(r32, uint32(r))
 	}
 	return false
 }

 func isExcludingLatin(rangeTab *RangeTable, r rune) bool {
 	r16 := rangeTab.R16
 	if off := rangeTab.LatinOffset; len(r16) > off && r <= rune(r16[len(r16)-1].Hi) {
 		return is16(r16[off:], uint16(r))
 	}
 	r32 := rangeTab.R32
 	if len(r32) > 0 && r >= rune(r32[0].Lo) {
 		return is32(r32, uint32(r))
 	}
 	return false
 }

 // IsUpper reports whether the rune is an upper case letter.
 func IsUpper(r rune) bool {
 	// See comment in IsGraphic.
 	if uint32(r) <= MaxLatin1 {
 		return properties[uint8(r)]&pLmask == pLu
 	}
 	return isExcludingLatin(Upper, r)
 }

 // IsLower reports whether the rune is a lower case letter.
 func IsLower(r rune) bool {
 	// See comment in IsGraphic.
 	if uint32(r) <= MaxLatin1 {
 		return properties[uint8(r)]&pLmask == pLl
 	}
 	return isExcludingLatin(Lower, r)
 }

 // IsTitle reports whether the rune is a title case letter.
 func IsTitle(r rune) bool {
 	if r <= MaxLatin1 {
 		return false
 	}
 	return isExcludingLatin(Title, r)
 }

 // to maps the rune using the specified case mapping.
 func to(_case int, r rune, caseRange []CaseRange) rune {
 	if _case < 0 || MaxCase <= _case {
 		return ReplacementChar // as reasonable an error as any
 	}
 	// binary search over ranges
 	lo := 0
 	hi := len(caseRange)
 	for lo < hi {
 		m := lo + (hi-lo)/2
 		cr := caseRange[m]
 		if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
 			delta := cr.Delta[_case]
 			if delta > MaxRune {
 				// In an Upper-Lower sequence, which always starts with
 				// an UpperCase letter, the real deltas always look like:
 				//	{0, 1, 0}    UpperCase (Lower is next)
 				//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
 				// The characters at even offsets from the beginning of the
 				// sequence are upper case; the ones at odd offsets are lower.
 				// The correct mapping can be done by clearing or setting the low
 				// bit in the sequence offset.
 				// The constants UpperCase and TitleCase are even while LowerCase
 				// is odd so we take the low bit from _case.
 				return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1))
 			}
 			return r + delta
 		}
 		if r < rune(cr.Lo) {
 			hi = m
 		} else {
 			lo = m + 1
 		}
 	}
 	return r
 }

 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
 func To(_case int, r rune) rune {
 	return to(_case, r, CaseRanges)
 }

 // ToUpper maps the rune to upper case.
 func ToUpper(r rune) rune {
 	if r <= MaxASCII {
 		if 'a' <= r && r <= 'z' {
 			r -= 'a' - 'A'
 		}
 		return r
 	}
 	return To(UpperCase, r)
 }

 // ToLower maps the rune to lower case.
 func ToLower(r rune) rune {
 	if r <= MaxASCII {
 		if 'A' <= r && r <= 'Z' {
 			r += 'a' - 'A'
 		}
 		return r
 	}
 	return To(LowerCase, r)
 }

 // ToTitle maps the rune to title case.
 func ToTitle(r rune) rune {
 	if r <= MaxASCII {
 		if 'a' <= r && r <= 'z' { // title case is upper case for ASCII
 			r -= 'a' - 'A'
 		}
 		return r
 	}
 	return To(TitleCase, r)
 }

 // ToUpper maps the rune to upper case giving priority to the special mapping.
 func (special SpecialCase) ToUpper(r rune) rune {
 	r1 := to(UpperCase, r, []CaseRange(special))
 	if r1 == r {
 		r1 = ToUpper(r)
 	}
 	return r1
 }

 // ToTitle maps the rune to title case giving priority to the special mapping.
 func (special SpecialCase) ToTitle(r rune) rune {
 	r1 := to(TitleCase, r, []CaseRange(special))
 	if r1 == r {
 		r1 = ToTitle(r)
 	}
 	return r1
 }

 // ToLower maps the rune to lower case giving priority to the special mapping.
 func (special SpecialCase) ToLower(r rune) rune {
 	r1 := to(LowerCase, r, []CaseRange(special))
 	if r1 == r {
 		r1 = ToLower(r)
 	}
 	return r1
 }

 // caseOrbit is defined in tables.go as []foldPair. Right now all the
 // entries fit in uint16, so use uint16. If that changes, compilation
 // will fail (the constants in the composite literal will not fit in uint16)
 // and the types here can change to uint32.
 type foldPair struct {
 	From uint16
 	To   uint16
 }

 // SimpleFold iterates over Unicode code points equivalent under
 // the Unicode-defined simple case folding. Among the code points
 // equivalent to rune (including rune itself), SimpleFold returns the
 // smallest rune > r if one exists, or else the smallest rune >= 0.
 // If r is not a valid Unicode code point, SimpleFold(r) returns r.
 //
 // For example:
 //	SimpleFold('A') = 'a'
 //	SimpleFold('a') = 'A'
 //
 //	SimpleFold('K') = 'k'
 //	SimpleFold('k') = '\u212A' (Kelvin symbol, K)
 //	SimpleFold('\u212A') = 'K'
 //
 //	SimpleFold('1') = '1'
 //
 //	SimpleFold(-2) = -2
 //
 func SimpleFold(r rune) rune {
 	if r < 0 || r > MaxRune {
 		return r
 	}

 	if int(r) < len(asciiFold) {
 		return rune(asciiFold[r])
 	}

 	// Consult caseOrbit table for special cases.
 	lo := 0
 	hi := len(caseOrbit)
 	for lo < hi {
 		m := lo + (hi-lo)/2
 		if rune(caseOrbit[m].From) < r {
 			lo = m + 1
 		} else {
 			hi = m
 		}
 	}
 	if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r {
 		return rune(caseOrbit[lo].To)
 	}

 	// No folding specified. This is a one- or two-element
 	// equivalence class containing rune and ToLower(rune)
 	// and ToUpper(rune) if they are different from rune.
 	if l := ToLower(r); l != r {
 		return l
 	}
 	return ToUpper(r)
 }
	// Copyright 2009 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Package unicode provides data and functions to test some properties of
	// Unicode code points.
	package unicode

	// Tables are regenerated each time we update the Unicode version.
	//go:generate go run maketables.go -tables=all -output tables.go

	const (
	MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
	ReplacementChar = '\uFFFD' // Represents invalid code points.
	MaxASCII = '\u007F' // maximum ASCII value.
	MaxLatin1 = '\u00FF' // maximum Latin-1 value.
	)

	// RangeTable defines a set of Unicode code points by listing the ranges of
	// code points within the set. The ranges are listed in two slices
	// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
	// The two slices must be in sorted order and non-overlapping.
	// Also, R32 should contain only values >= 0x10000 (1<<16).
	type RangeTable struct {
	R16 []Range16
	R32 []Range32
	LatinOffset int // number of entries in R16 with Hi <= MaxLatin1
	}

	// Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi
	// inclusive and has the specified stride.
	type Range16 struct {
	Lo uint16
	Hi uint16
	Stride uint16
	}

	// Range32 represents of a range of Unicode code points and is used when one or
	// more of the values will not fit in 16 bits. The range runs from Lo to Hi
	// inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
	type Range32 struct {
	Lo uint32
	Hi uint32
	Stride uint32
	}

	// CaseRange represents a range of Unicode code points for simple (one
	// code point to one code point) case conversion.
	// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
	// are the number to add to the code point to reach the code point for a
	// different case for that character. They may be negative. If zero, it
	// means the character is in the corresponding case. There is a special
	// case representing sequences of alternating corresponding Upper and Lower
	// pairs. It appears with a fixed Delta of
	// {UpperLower, UpperLower, UpperLower}
	// The constant UpperLower has an otherwise impossible delta value.
	type CaseRange struct {
	Lo uint32
	Hi uint32
	Delta d
	}

	// SpecialCase represents language-specific case mappings such as Turkish.
	// Methods of SpecialCase customize (by overriding) the standard mappings.
	type SpecialCase []CaseRange

	// BUG(r): There is no mechanism for full case folding, that is, for
	// characters that involve multiple runes in the input or output.

	// Indices into the Delta arrays inside CaseRanges for case mapping.
	const (
	UpperCase = iota
	LowerCase
	TitleCase
	MaxCase
	)

	type d [MaxCase]rune // to make the CaseRanges text shorter

	// If the Delta field of a CaseRange is UpperLower, it means
	// this CaseRange represents a sequence of the form (say)
	// Upper Lower Upper Lower.
	const (
	UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
	)

	// linearMax is the maximum size table for linear search for non-Latin1 rune.
	// Derived by running 'go test -calibrate'.
	const linearMax = 18

	// is16 reports whether r is in the sorted slice of 16-bit ranges.
	func is16(ranges []Range16, r uint16) bool {
	if len(ranges) <= linearMax \|\| r <= MaxLatin1 {
	for i := range ranges {
	range_ := &ranges[i]
	if r < range_.Lo {
	return false
	}
	if r <= range_.Hi {
	return range_.Stride == 1 \|\| (r-range_.Lo)%range_.Stride == 0
	}
	}
	return false
	}

	// binary search over ranges
	lo := 0
	hi := len(ranges)
	for lo < hi {
	m := lo + (hi-lo)/2
	range_ := &ranges[m]
	if range_.Lo <= r && r <= range_.Hi {
	return range_.Stride == 1 \|\| (r-range_.Lo)%range_.Stride == 0
	}
	if r < range_.Lo {
	hi = m
	} else {
	lo = m + 1
	}
	}
	return false
	}

	// is32 reports whether r is in the sorted slice of 32-bit ranges.
	func is32(ranges []Range32, r uint32) bool {
	if len(ranges) <= linearMax {
	for i := range ranges {
	range_ := &ranges[i]
	if r < range_.Lo {
	return false
	}
	if r <= range_.Hi {
	return range_.Stride == 1 \|\| (r-range_.Lo)%range_.Stride == 0
	}
	}
	return false
	}

	// binary search over ranges
	lo := 0
	hi := len(ranges)
	for lo < hi {
	m := lo + (hi-lo)/2
	range_ := ranges[m]
	if range_.Lo <= r && r <= range_.Hi {
	return range_.Stride == 1 \|\| (r-range_.Lo)%range_.Stride == 0
	}
	if r < range_.Lo {
	hi = m
	} else {
	lo = m + 1
	}
	}
	return false
	}

	// Is reports whether the rune is in the specified table of ranges.
	func Is(rangeTab *RangeTable, r rune) bool {
	r16 := rangeTab.R16
	if len(r16) > 0 && r <= rune(r16[len(r16)-1].Hi) {
	return is16(r16, uint16(r))
	}
	r32 := rangeTab.R32
	if len(r32) > 0 && r >= rune(r32[0].Lo) {
	return is32(r32, uint32(r))
	}
	return false
	}

	func isExcludingLatin(rangeTab *RangeTable, r rune) bool {
	r16 := rangeTab.R16
	if off := rangeTab.LatinOffset; len(r16) > off && r <= rune(r16[len(r16)-1].Hi) {
	return is16(r16[off:], uint16(r))
	}
	r32 := rangeTab.R32
	if len(r32) > 0 && r >= rune(r32[0].Lo) {
	return is32(r32, uint32(r))
	}
	return false
	}

	// IsUpper reports whether the rune is an upper case letter.
	func IsUpper(r rune) bool {
	// See comment in IsGraphic.
	if uint32(r) <= MaxLatin1 {
	return properties[uint8(r)]&pLmask == pLu
	}
	return isExcludingLatin(Upper, r)
	}

	// IsLower reports whether the rune is a lower case letter.
	func IsLower(r rune) bool {
	// See comment in IsGraphic.
	if uint32(r) <= MaxLatin1 {
	return properties[uint8(r)]&pLmask == pLl
	}
	return isExcludingLatin(Lower, r)
	}

	// IsTitle reports whether the rune is a title case letter.
	func IsTitle(r rune) bool {
	if r <= MaxLatin1 {
	return false
	}
	return isExcludingLatin(Title, r)
	}

	// to maps the rune using the specified case mapping.
	func to(_case int, r rune, caseRange []CaseRange) rune {
	if _case < 0 \|\| MaxCase <= _case {
	return ReplacementChar // as reasonable an error as any
	}
	// binary search over ranges
	lo := 0
	hi := len(caseRange)
	for lo < hi {
	m := lo + (hi-lo)/2
	cr := caseRange[m]
	if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
	delta := cr.Delta[_case]
	if delta > MaxRune {
	// In an Upper-Lower sequence, which always starts with
	// an UpperCase letter, the real deltas always look like:
	// {0, 1, 0} UpperCase (Lower is next)
	// {-1, 0, -1} LowerCase (Upper, Title are previous)
	// The characters at even offsets from the beginning of the
	// sequence are upper case; the ones at odd offsets are lower.
	// The correct mapping can be done by clearing or setting the low
	// bit in the sequence offset.
	// The constants UpperCase and TitleCase are even while LowerCase
	// is odd so we take the low bit from _case.
	return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 \| rune(_case&1))
	}
	return r + delta
	}
	if r < rune(cr.Lo) {
	hi = m
	} else {
	lo = m + 1
	}
	}
	return r
	}

	// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
	func To(_case int, r rune) rune {
	return to(_case, r, CaseRanges)
	}

	// ToUpper maps the rune to upper case.
	func ToUpper(r rune) rune {
	if r <= MaxASCII {
	if 'a' <= r && r <= 'z' {
	r -= 'a' - 'A'
	}
	return r
	}
	return To(UpperCase, r)
	}

	// ToLower maps the rune to lower case.
	func ToLower(r rune) rune {
	if r <= MaxASCII {
	if 'A' <= r && r <= 'Z' {
	r += 'a' - 'A'
	}
	return r
	}
	return To(LowerCase, r)
	}

	// ToTitle maps the rune to title case.
	func ToTitle(r rune) rune {
	if r <= MaxASCII {
	if 'a' <= r && r <= 'z' { // title case is upper case for ASCII
	r -= 'a' - 'A'
	}
	return r
	}
	return To(TitleCase, r)
	}

	// ToUpper maps the rune to upper case giving priority to the special mapping.
	func (special SpecialCase) ToUpper(r rune) rune {
	r1 := to(UpperCase, r, []CaseRange(special))
	if r1 == r {
	r1 = ToUpper(r)
	}
	return r1
	}

	// ToTitle maps the rune to title case giving priority to the special mapping.
	func (special SpecialCase) ToTitle(r rune) rune {
	r1 := to(TitleCase, r, []CaseRange(special))
	if r1 == r {
	r1 = ToTitle(r)
	}
	return r1
	}

	// ToLower maps the rune to lower case giving priority to the special mapping.
	func (special SpecialCase) ToLower(r rune) rune {
	r1 := to(LowerCase, r, []CaseRange(special))
	if r1 == r {
	r1 = ToLower(r)
	}
	return r1
	}

	// caseOrbit is defined in tables.go as []foldPair. Right now all the
	// entries fit in uint16, so use uint16. If that changes, compilation
	// will fail (the constants in the composite literal will not fit in uint16)
	// and the types here can change to uint32.
	type foldPair struct {
	From uint16
	To uint16
	}

	// SimpleFold iterates over Unicode code points equivalent under
	// the Unicode-defined simple case folding. Among the code points
	// equivalent to rune (including rune itself), SimpleFold returns the
	// smallest rune > r if one exists, or else the smallest rune >= 0.
	// If r is not a valid Unicode code point, SimpleFold(r) returns r.
	//
	// For example:
	// SimpleFold('A') = 'a'
	// SimpleFold('a') = 'A'
	//
	// SimpleFold('K') = 'k'
	// SimpleFold('k') = '\u212A' (Kelvin symbol, K)
	// SimpleFold('\u212A') = 'K'
	//
	// SimpleFold('1') = '1'
	//
	// SimpleFold(-2) = -2
	//
	func SimpleFold(r rune) rune {
	if r < 0 \|\| r > MaxRune {
	return r
	}

	if int(r) < len(asciiFold) {
	return rune(asciiFold[r])
	}

	// Consult caseOrbit table for special cases.
	lo := 0
	hi := len(caseOrbit)
	for lo < hi {
	m := lo + (hi-lo)/2
	if rune(caseOrbit[m].From) < r {
	lo = m + 1
	} else {
	hi = m
	}
	}
	if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r {
	return rune(caseOrbit[lo].To)
	}

	// No folding specified. This is a one- or two-element
	// equivalence class containing rune and ToLower(rune)
	// and ToUpper(rune) if they are different from rune.
	if l := ToLower(r); l != r {
	return l
	}
	return ToUpper(r)
	}