| // Copyright 2014 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package cases |
| |
| // This file contains the definitions of case mappings for all supported |
| // languages. The rules for the language-specific tailorings were taken and |
| // modified from the CLDR transform definitions in common/transforms. |
| |
| import ( |
| "strings" |
| "unicode" |
| "unicode/utf8" |
| |
| "golang.org/x/text/language" |
| "golang.org/x/text/transform" |
| "golang.org/x/text/unicode/norm" |
| ) |
| |
| // A mapFunc takes a context set to the current rune and writes the mapped |
| // version to the same context. It may advance the context to the next rune. It |
| // returns whether a checkpoint is possible: whether the pDst bytes written to |
| // dst so far won't need changing as we see more source bytes. |
| type mapFunc func(*context) bool |
| |
| // maxIgnorable defines the maximum number of ignorables to consider for |
| // lookahead operations. |
| const maxIgnorable = 30 |
| |
| // supported lists the language tags for which we have tailorings. |
| const supported = "und af az el lt nl tr" |
| |
| func init() { |
| tags := []language.Tag{} |
| for _, s := range strings.Split(supported, " ") { |
| tags = append(tags, language.MustParse(s)) |
| } |
| matcher = language.NewMatcher(tags) |
| Supported = language.NewCoverage(tags) |
| } |
| |
| var ( |
| matcher language.Matcher |
| |
| Supported language.Coverage |
| |
| // We keep the following lists separate, instead of having a single per- |
| // language struct, to give the compiler a chance to remove unused code. |
| |
| // Some uppercase mappers are stateless, so we can precompute the |
| // Transformers and save a bit on runtime allocations. |
| upperFunc = []mapFunc{ |
| nil, // und |
| nil, // af |
| aztrUpper(upper), // az |
| elUpper, // el |
| ltUpper(upper), // lt |
| nil, // nl |
| aztrUpper(upper), // tr |
| } |
| |
| undUpper transform.Transformer = &undUpperCaser{} |
| |
| lowerFunc = []mapFunc{ |
| lower, // und |
| lower, // af |
| aztrLower, // az |
| lower, // el |
| ltLower, // lt |
| lower, // nl |
| aztrLower, // tr |
| } |
| |
| titleInfos = []struct { |
| title, lower mapFunc |
| rewrite func(*context) |
| }{ |
| {title, lower, nil}, // und |
| {title, lower, afnlRewrite}, // af |
| {aztrUpper(title), aztrLower, nil}, // az |
| {title, lower, nil}, // el |
| {ltUpper(title), ltLower, nil}, // lt |
| {nlTitle, lower, afnlRewrite}, // nl |
| {aztrUpper(title), aztrLower, nil}, // tr |
| } |
| ) |
| |
| func makeUpper(t language.Tag, o options) transform.Transformer { |
| _, i, _ := matcher.Match(t) |
| f := upperFunc[i] |
| if f == nil { |
| return undUpper |
| } |
| return &simpleCaser{f: f} |
| } |
| |
| func makeLower(t language.Tag, o options) transform.Transformer { |
| _, i, _ := matcher.Match(t) |
| f := lowerFunc[i] |
| if o.noFinalSigma { |
| return &simpleCaser{f: f} |
| } |
| return &lowerCaser{ |
| first: f, |
| midWord: finalSigma(f), |
| } |
| } |
| |
| func makeTitle(t language.Tag, o options) transform.Transformer { |
| _, i, _ := matcher.Match(t) |
| x := &titleInfos[i] |
| lower := x.lower |
| if o.noLower { |
| lower = (*context).copy |
| } else if !o.noFinalSigma { |
| lower = finalSigma(lower) |
| } |
| return &titleCaser{ |
| title: x.title, |
| lower: lower, |
| rewrite: x.rewrite, |
| } |
| } |
| |
// TODO: consider a similar special case for the fast majority lower case. This
// is a bit more involved, so it will require some more precise benchmarking to
// justify it.
| |
| type undUpperCaser struct{ transform.NopResetter } |
| |
| // undUpperCaser implements the Transformer interface for doing an upper case |
| // mapping for the root locale (und). It eliminates the need for an allocation |
| // as it prevents escaping by not using function pointers. |
| func (t *undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| c := context{dst: dst, src: src, atEOF: atEOF} |
| for c.next() { |
| upper(&c) |
| c.checkpoint() |
| } |
| return c.ret() |
| } |
| |
| type simpleCaser struct { |
| context |
| f mapFunc |
| } |
| |
| // simpleCaser implements the Transformer interface for doing a case operation |
| // on a rune-by-rune basis. |
| func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| t.context = context{dst: dst, src: src, atEOF: atEOF} |
| c := &t.context |
| for c.next() && t.f(c) { |
| c.checkpoint() |
| } |
| return c.ret() |
| } |
| |
| // lowerCaser implements the Transformer interface. The default Unicode lower |
| // casing requires different treatment for the first and subsequent characters |
| // of a word, most notably to handle the Greek final Sigma. |
| type lowerCaser struct { |
| context |
| |
| first, midWord mapFunc |
| } |
| |
| func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| t.context = context{dst: dst, src: src, atEOF: atEOF} |
| c := &t.context |
| |
| for isInterWord := true; c.next(); { |
| if isInterWord { |
| if c.info.isCased() { |
| if !t.first(c) { |
| break |
| } |
| isInterWord = false |
| } else if !c.copy() { |
| break |
| } |
| } else { |
| if c.info.isNotCasedAndNotCaseIgnorable() { |
| if !c.copy() { |
| break |
| } |
| isInterWord = true |
| } else if !t.midWord(c) { |
| break |
| } |
| } |
| c.checkpoint() |
| } |
| return c.ret() |
| } |
| |
| // titleCaser implements the Transformer interface. Title casing algorithms |
| // distinguish between the first letter of a word and subsequent letters of the |
| // same word. It uses state to avoid requiring a potentially infinite lookahead. |
| type titleCaser struct { |
| context |
| |
| // rune mappings used by the actual casing algorithms. |
| title, lower mapFunc |
| |
| rewrite func(*context) |
| } |
| |
| // Transform implements the standard Unicode title case algorithm as defined in |
| // Chapter 3 of The Unicode Standard: |
| // toTitlecase(X): Find the word boundaries in X according to Unicode Standard |
| // Annex #29, "Unicode Text Segmentation." For each word boundary, find the |
| // first cased character F following the word boundary. If F exists, map F to |
| // Titlecase_Mapping(F); then map all characters C between F and the following |
| // word boundary to Lowercase_Mapping(C). |
| func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord} |
| c := &t.context |
| |
| if !c.next() { |
| return c.ret() |
| } |
| |
| for { |
| p := c.info |
| if t.rewrite != nil { |
| t.rewrite(c) |
| } |
| |
| wasMid := p.isCaseIgnorableAndNonBreakStarter() |
| // Break out of this loop on failure to ensure we do not modify the |
| // state incorrectly. |
| if p.isCased() && !p.isCaseIgnorableAndNotCased() { |
| if !c.isMidWord { |
| if !t.title(c) { |
| break |
| } |
| c.isMidWord = true |
| } else if !t.lower(c) { |
| break |
| } |
| } else if !c.copy() { |
| break |
| } |
| |
| // TODO: make this an "else if" if we can prove that no rune that does |
| // not match the first condition of the if statement can be a break. |
| if p.isBreak() { |
| c.isMidWord = false |
| } |
| |
| // As we save the state of the transformer, it is safe to call |
| // checkpoint after any successful write. |
| c.checkpoint() |
| |
| if !c.next() { |
| break |
| } |
| if wasMid && c.info.isCaseIgnorableAndNonBreakStarter() { |
| c.isMidWord = false |
| } |
| } |
| return c.ret() |
| } |
| |
| // lower writes the lowercase version of the current rune to dst. |
| func lower(c *context) bool { |
| if c.info&hasMappingMask == 0 || c.caseType() == cLower { |
| return c.copy() |
| } |
| if c.info&exceptionBit == 0 { |
| return c.copyXOR() |
| } |
| e := exceptions[c.info>>exceptionShift+1:] |
| if nLower := (e[0] >> lengthBits) & lengthMask; nLower != noChange { |
| return c.writeString(e[1 : 1+nLower]) |
| } |
| return c.copy() |
| } |
| |
| // upper writes the uppercase version of the current rune to dst. |
| func upper(c *context) bool { |
| ct := c.caseType() |
| if c.info&hasMappingMask == 0 || ct == cUpper { |
| return c.copy() |
| } |
| if c.info&exceptionBit == 0 { |
| return c.copyXOR() |
| } |
| e := exceptions[c.info>>exceptionShift+1:] |
| // Get length of first special case mapping. |
| n := (e[0] >> lengthBits) & lengthMask |
| if ct == cTitle { |
| // The first special case mapping is for lower. Set n to the second. |
| if n == noChange { |
| n = 0 |
| } |
| n, e = e[0]&lengthMask, e[n:] |
| } |
| if n != noChange { |
| return c.writeString(e[1 : 1+n]) |
| } |
| return c.copy() |
| } |
| |
| // title writes the title case version of the current rune to dst. |
| func title(c *context) bool { |
| ct := c.caseType() |
| if c.info&hasMappingMask == 0 || ct == cTitle { |
| return c.copy() |
| } |
| if c.info&exceptionBit == 0 { |
| if ct == cLower { |
| return c.copyXOR() |
| } |
| return c.copy() |
| } |
| // Get the exception data. |
| e := exceptions[c.info>>exceptionShift+1:] |
| |
| nFirst := (e[0] >> lengthBits) & lengthMask |
| if nTitle := e[0] & lengthMask; nTitle != noChange { |
| if nFirst != noChange { |
| e = e[nFirst:] |
| } |
| return c.writeString(e[1 : 1+nTitle]) |
| } |
| if ct == cLower && nFirst != noChange { |
| // Use the uppercase version instead. |
| return c.writeString(e[1 : 1+nFirst]) |
| } |
| // Already in correct case. |
| return c.copy() |
| } |
| |
// finalSigma adds Greek final Sigma handling to another casing function. It
// determines whether a lowercased sigma should be σ or ς by looking ahead for
// case-ignorables and a cased letter.
| func finalSigma(f mapFunc) mapFunc { |
| return func(c *context) bool { |
| // ::NFD(); |
| // # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA |
| // Σ } [:case-ignorable:]* [:cased:] → σ; |
| // [:cased:] [:case-ignorable:]* { Σ → ς; |
| // ::Any-Lower; |
| // ::NFC(); |
| |
| if !c.hasPrefix("Σ") { |
| return f(c) |
| } |
| |
| p := c.pDst |
| c.writeString("ς") |
| // We need to do one more iteration after maxIgnorable, as a cased |
| // letter is not an ignorable and may modify the result. |
| for i := 0; i < maxIgnorable+1; i++ { |
| if !c.next() { |
| return false |
| } |
| if !c.info.isCaseIgnorable() { |
| if c.info.isCased() { |
| // p+1 is guaranteed to be in bounds: if writing ς was |
| // successful, p+1 will contain the second byte of ς. If not, |
| // this function will have returned after c.next returned false. |
| c.dst[p+1]++ // ς → σ |
| } |
| c.unreadRune() |
| return true |
| } |
| // A case ignorable may also introduce a word break, so we may need |
| // to continue searching even after detecting a break. |
| c.isMidWord = c.isMidWord && !c.info.isBreak() |
| c.copy() |
| } |
| return true |
| } |
| } |
| |
| // elUpper implements Greek upper casing, which entails removing a predefined |
| // set of non-blocked modifiers. Note that these accents should not be removed |
| // for title casing! |
| // Example: "Οδός" -> "ΟΔΟΣ". |
| func elUpper(c *context) bool { |
| // From CLDR: |
| // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; |
| // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ; |
| |
| r, _ := utf8.DecodeRune(c.src[c.pSrc:]) |
| oldPDst := c.pDst |
| if !upper(c) { |
| return false |
| } |
| if !unicode.Is(unicode.Greek, r) { |
| return true |
| } |
| i := 0 |
| // Take the properties of the uppercased rune that is already written to the |
| // destination. This saves us the trouble of having to uppercase the |
| // decomposed rune again. |
| if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil { |
| // Restore the destination position and process the decomposed rune. |
| r, sz := utf8.DecodeRune(b) |
| if r <= 0xFF { // See A.6.1 |
| return true |
| } |
| c.pDst = oldPDst |
| // Insert the first rune and ignore the modifiers. See A.6.2. |
| c.writeBytes(b[:sz]) |
| i = len(b[sz:]) / 2 // Greek modifiers are always of length 2. |
| } |
| |
| for ; i < maxIgnorable && c.next(); i++ { |
| switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r { |
| // Above and Iota Subscript |
| case 0x0300, // U+0300 COMBINING GRAVE ACCENT |
| 0x0301, // U+0301 COMBINING ACUTE ACCENT |
| 0x0304, // U+0304 COMBINING MACRON |
| 0x0306, // U+0306 COMBINING BREVE |
| 0x0308, // U+0308 COMBINING DIAERESIS |
| 0x0313, // U+0313 COMBINING COMMA ABOVE |
| 0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE |
| 0x0342, // U+0342 COMBINING GREEK PERISPOMENI |
| 0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI |
| // No-op. Gobble the modifier. |
| |
| default: |
| switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() { |
| case cccZero: |
| c.unreadRune() |
| return true |
| |
| // We don't need to test for IotaSubscript as the only rune that |
| // qualifies (U+0345) was already excluded in the switch statement |
| // above. See A.4. |
| |
| case cccAbove: |
| return c.copy() |
| default: |
| // Some other modifier. We're still allowed to gobble Greek |
| // modifiers after this. |
| c.copy() |
| } |
| } |
| } |
| return i == maxIgnorable |
| } |
| |
| func ltLower(c *context) bool { |
| // From CLDR: |
| // # Introduce an explicit dot above when lowercasing capital I's and J's |
| // # whenever there are more accents above. |
| // # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) |
| // # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I |
| // # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J |
| // # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK |
| // # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE |
| // # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE |
| // # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE |
| // ::NFD(); |
| // I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307; |
| // J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307; |
| // Į } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → į \u0307; |
| // Ì → i \u0307 \u0300; |
| // Í → i \u0307 \u0301; |
| // Ĩ → i \u0307 \u0303; |
| // ::Any-Lower(); |
| // ::NFC(); |
| |
| i := 0 |
| if r := c.src[c.pSrc]; r < utf8.RuneSelf { |
| lower(c) |
| if r != 'I' && r != 'J' { |
| return true |
| } |
| } else { |
| p := norm.NFD.Properties(c.src[c.pSrc:]) |
| if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') { |
| // UTF-8 optimization: the decomposition will only have an above |
| // modifier if the last rune of the decomposition is in [U+300-U+311]. |
| // In all other cases, a decomposition starting with I is always |
| // an I followed by modifiers that are not cased themselves. See A.2. |
| if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4. |
| if !c.writeBytes(d[:1]) { |
| return false |
| } |
| c.dst[c.pDst-1] += 'a' - 'A' // lower |
| |
| // Assumption: modifier never changes on lowercase. See A.1. |
| // Assumption: all modifiers added have CCC = Above. See A.2.3. |
| return c.writeString("\u0307") && c.writeBytes(d[1:]) |
| } |
| // In all other cases the additional modifiers will have a CCC |
| // that is less than 230 (Above). We will insert the U+0307, if |
| // needed, after these modifiers so that a string in FCD form |
| // will remain so. See A.2.2. |
| lower(c) |
| i = 1 |
| } else { |
| return lower(c) |
| } |
| } |
| |
| for ; i < maxIgnorable && c.next(); i++ { |
| switch c.info.cccType() { |
| case cccZero: |
| c.unreadRune() |
| return true |
| case cccAbove: |
| return c.writeString("\u0307") && c.copy() // See A.1. |
| default: |
| c.copy() // See A.1. |
| } |
| } |
| return i == maxIgnorable |
| } |
| |
| func ltUpper(f mapFunc) mapFunc { |
| return func(c *context) bool { |
| // From CLDR: |
| // ::NFD(); |
| // [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ; |
| // ::Any-Upper(); |
| // ::NFC(); |
| |
| // TODO: See A.5. A soft-dotted rune never has an exception. This would |
| // allow us to overload the exception bit and encode this property in |
| // info. Need to measure performance impact of this. |
| r, _ := utf8.DecodeRune(c.src[c.pSrc:]) |
| oldPDst := c.pDst |
| if !f(c) { |
| return false |
| } |
| if !unicode.Is(unicode.Soft_Dotted, r) { |
| return true |
| } |
| |
| // We don't need to do an NFD normalization, as a soft-dotted rune never |
| // contains U+0307. See A.3. |
| |
| i := 0 |
| for ; i < maxIgnorable && c.next(); i++ { |
| switch c.info.cccType() { |
| case cccZero: |
| c.unreadRune() |
| return true |
| case cccAbove: |
| if c.hasPrefix("\u0307") { |
| // We don't do a full NFC, but rather combine runes for |
| // some of the common cases. (Returning NFC or |
| // preserving normal form is neither a requirement nor |
| // a possibility anyway). |
| if !c.next() { |
| return false |
| } |
| if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc { |
| s := "" |
| switch c.src[c.pSrc+1] { |
| case 0x80: // U+0300 COMBINING GRAVE ACCENT |
| s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE |
| case 0x81: // U+0301 COMBINING ACUTE ACCENT |
| s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE |
| case 0x83: // U+0303 COMBINING TILDE |
| s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE |
| case 0x88: // U+0308 COMBINING DIAERESIS |
| s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS |
| default: |
| } |
| if s != "" { |
| c.pDst = oldPDst |
| return c.writeString(s) |
| } |
| } |
| } |
| return c.copy() |
| default: |
| c.copy() |
| } |
| } |
| return i == maxIgnorable |
| } |
| } |
| |
| func aztrUpper(f mapFunc) mapFunc { |
| return func(c *context) bool { |
| // i→İ; |
| if c.src[c.pSrc] == 'i' { |
| return c.writeString("İ") |
| } |
| return f(c) |
| } |
| } |
| |
| func aztrLower(c *context) (done bool) { |
| // From CLDR: |
| // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri |
| // # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
| // İ→i; |
| // # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. |
| // # This matches the behavior of the canonically equivalent I-dot_above |
| // # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE |
| // # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. |
| // # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I |
| // I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; |
| // I→ı ; |
| // ::Any-Lower(); |
| if c.hasPrefix("\u0130") { // İ |
| return c.writeString("i") |
| } |
| if c.src[c.pSrc] != 'I' { |
| return lower(c) |
| } |
| |
| // We ignore the lower-case I for now, but insert it later when we know |
| // which form we need. |
| start := c.pSrc + c.sz |
| |
| i := 0 |
| Loop: |
| // We check for up to n ignorables before \u0307. As \u0307 is an |
| // ignorable as well, n is maxIgnorable-1. |
| for ; i < maxIgnorable && c.next(); i++ { |
| switch c.info.cccType() { |
| case cccAbove: |
| if c.hasPrefix("\u0307") { |
| return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307 |
| } |
| done = true |
| break Loop |
| case cccZero: |
| c.unreadRune() |
| done = true |
| break Loop |
| default: |
| // We'll write this rune after we know which starter to use. |
| } |
| } |
| if i == maxIgnorable { |
| done = true |
| } |
| return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done |
| } |
| |
| func nlTitle(c *context) bool { |
| // From CLDR: |
| // # Special titlecasing for Dutch initial "ij". |
| // ::Any-Title(); |
	// # Fix up Ij at the beginning of a "word" (per Any-Title, not UAX #29)
| // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ; |
| if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' { |
| return title(c) |
| } |
| |
| if !c.writeString("I") || !c.next() { |
| return false |
| } |
| if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' { |
| return c.writeString("J") |
| } |
| c.unreadRune() |
| return true |
| } |
| |
| // Not part of CLDR, but see http://unicode.org/cldr/trac/ticket/7078. |
| func afnlRewrite(c *context) { |
| if c.hasPrefix("'") || c.hasPrefix("’") { |
| c.isMidWord = true |
| } |
| } |