blob: c443b78d82554e08a6e162c38b02137886bad3b9 [file] [log] [blame]
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
// This file contains Form-specific logic and wrappers for data in tables.go.
// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
// bits
// 0..8: ccc
// 9..12: qcInfo (see below). isYesD is always true (no decompostion).
// 16: 1
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
// <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 2 and 3 of qcIfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
// there is an additional leading ccc.
const (
qcInfoMask = 0xF // to clear all but the relevant bits in a qcInfo
headerLenMask = 0x3F // extract the length value from the header byte
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
// runeInfo is a representation for the data stored in charinfoTrie.
type runeInfo struct {
pos uint8 // start position in reorderBuffer; used in composition.go
size uint8 // length of UTF-8 encoding of this rune
ccc uint8 // leading canonical combining class (ccc if not decomposition)
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
flags qcInfo // quick check flags
index uint16
}
// functions dispatchable per form
type lookupFunc func(b input, i int) runeInfo
// formInfo holds Form-specific functions and tables.
type formInfo struct {
form Form
composing, compatibility bool // form type
info lookupFunc
}
var formTable []*formInfo
func init() {
formTable = make([]*formInfo, 4)
for i := range formTable {
f := &formInfo{}
formTable[i] = f
f.form = Form(i)
if Form(i) == NFKD || Form(i) == NFKC {
f.compatibility = true
f.info = lookupInfoNFKC
} else {
f.info = lookupInfoNFC
}
if Form(i) == NFC || Form(i) == NFKC {
f.composing = true
}
}
}
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
// unexpected behavior for the user. For example, in NFD, there is a boundary
// after 'a'. However, a might combine with modifiers, so from the application's
// perspective it is not a good boundary. We will therefore always use the
// boundaries for the combining variants.
func (i runeInfo) boundaryBefore() bool {
if i.ccc == 0 && !i.combinesBackward() {
return true
}
// We assume that the CCC of the first character in a decomposition
// is always non-zero if different from info.ccc and that we can return
// false at this point. This is verified by maketables.
return false
}
func (i runeInfo) boundaryAfter() bool {
return i.isInert()
}
// We pack quick check data in 4 bits:
// 0: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
// 1..2: NFC_QC Yes(00), No (10), or Maybe (11)
// 3: Combines forward (0 == false, 1 == true)
//
// When all 4 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
type qcInfo uint8
func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 }
func (i runeInfo) isYesD() bool { return i.flags&0x1 == 0 }
func (i runeInfo) combinesForward() bool { return i.flags&0x8 != 0 }
func (i runeInfo) combinesBackward() bool { return i.flags&0x2 != 0 } // == isMaybe
func (i runeInfo) hasDecomposition() bool { return i.flags&0x1 != 0 } // == isNoD
func (r runeInfo) isInert() bool {
return r.flags&0xf == 0 && r.ccc == 0
}
func (r runeInfo) decomposition() []byte {
if r.index == 0 {
return nil
}
p := r.index
n := decomps[p] & 0x3F
p++
return decomps[p : p+uint16(n)]
}
// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition map for NFC and NFKC are identical.
// combine returns the combined rune or 0 if it doesn't exist.
func combine(a, b rune) rune {
key := uint32(uint16(a))<<16 + uint32(uint16(b))
return recompMap[key]
}
func lookupInfoNFC(b input, i int) runeInfo {
v, sz := b.charinfoNFC(i)
return compInfo(v, sz)
}
func lookupInfoNFKC(b input, i int) runeInfo {
v, sz := b.charinfoNFKC(i)
return compInfo(v, sz)
}
// compInfo converts the information contained in v and sz
// to a runeInfo. See the comment at the top of the file
// for more information on the format.
func compInfo(v uint16, sz int) runeInfo {
if v == 0 {
return runeInfo{size: uint8(sz)}
} else if v >= 0x8000 {
return runeInfo{
size: uint8(sz),
ccc: uint8(v),
tccc: uint8(v),
flags: qcInfo(v>>8) & qcInfoMask,
}
}
// has decomposition
h := decomps[v]
f := (qcInfo(h&headerFlagsMask) >> 4) | 0x1
ri := runeInfo{size: uint8(sz), flags: f, index: v}
if v >= firstCCC {
v += uint16(h&headerLenMask) + 1
ri.tccc = decomps[v]
if v >= firstLeadingCCC {
ri.ccc = decomps[v+1]
}
}
return ri
}