| // Copyright 2013 The Go Authors. All rights reserved. |
| // |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file or at |
| // https://developers.google.com/open-source/licenses/bsd. |
| |
| // This file implements the Paice/Husk stemming algorithm. |
| // http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm |
| |
| package database |
| |
| import ( |
| "bytes" |
| "regexp" |
| "strconv" |
| ) |
| |
// stemRuleText is the stemming rule table in the textual form consumed by
// parseStemRules. A rule starts at the beginning of a line or after a space
// and is written as
//
//	<ending, spelled backwards>[*]<digit>[<letters>]<terminator>
//
// The leading letters are the word ending the rule matches; parseStemRules
// reverses them back into reading order. An optional '*' limits the rule to
// words that no earlier rule has modified. The digit is the number of bytes
// removed from the end of the word and the letters after it are appended in
// their place. The terminator '>' lets stemming continue with the shortened
// word, while '.' ends stemming after the rule is applied.
//
// For example, "sei3y>" matches words ending in "ies", removes three bytes,
// appends "y" and continues; "ss0." matches words ending in "ss", leaves them
// unchanged and stops.
//
// Each line holds the rules for one final letter. Rules sharing a final
// letter are tried in the order written here and the first applicable rule
// is the one used, so the order within a line is significant.
| const stemRuleText = ` |
| ai*2. a*1. |
| bb1. |
| city3s. ci2> cn1t> |
| dd1. dei3y> deec2ss. dee1. de2> dooh4> |
| e1> |
| feil1v. fi2> |
| gni3> gai3y. ga2> gg1. |
| ht*2. hsiug5ct. hsi3> |
| i*1. i1y> |
| ji1d. juf1s. ju1d. jo1d. jeh1r. jrev1t. jsim2t. jn1d. j1s. |
| lbaifi6. lbai4y. lba3> lbi3. lib2l> lc1. lufi4y. luf3> lu2. lai3> lau3> la2> ll1. |
| mui3. mu*2. msi3> mm1. |
| nois4j> noix4ct. noi3> nai3> na2> nee0. ne2> nn1. |
| pihs4> pp1. |
| re2> rae0. ra2. ro2> ru2> rr1. rt1> rei3y> |
| sei3y> sis2. si2> ssen4> ss0. suo3> su*2. s*1> s0. |
| tacilp4y. ta2> tnem4> tne3> tna3> tpir2b. tpro2b. tcud1. tpmus2. tpec2iv. tulo2v. tsis0. tsi3> tt1. |
| uqi3. ugo1. |
| vis3j> vie0. vi2> |
| ylb1> yli3y> ylp0. yl2> ygo1. yhp1. ymo1. ypo1. yti3> yte3> ytl2. yrtsi5. yra3> yro3> yfi3. ycn2t> yca3> |
| zi2> zy1s. |
| ` |
| |
// stemRule is one parsed entry of the stemming rule table.
type stemRule struct {
	// text is the rule exactly as it was matched in the rule table source.
	text string

	// suffix is the word ending the rule applies to, in reading order;
	// append is what gets added once the trailing bytes have been removed.
	suffix, append []byte

	// remove is the number of bytes cut from the end of the word.
	remove int

	// intact restricts the rule to words no earlier rule has changed;
	// more asks for another pass over the rules after this one is applied.
	intact, more bool
}
| |
| func parseStemRules() map[byte][]*stemRule { |
| |
| rules := make(map[byte][]*stemRule) |
| for _, m := range regexp.MustCompile(`(?m)(?:^| )([a-zA-Z]*)(\*?)([0-9])([a-zA-z]*)([.>])`).FindAllStringSubmatch(stemRuleText, -1) { |
| |
| suffix := []byte(m[1]) |
| for i := 0; i < len(suffix)/2; i++ { |
| j := len(suffix) - 1 - i |
| suffix[i], suffix[j] = suffix[j], suffix[i] |
| } |
| |
| remove, _ := strconv.Atoi(m[3]) |
| r := &stemRule{ |
| text: m[0], |
| suffix: suffix, |
| intact: m[2] == "*", |
| remove: remove, |
| append: []byte(m[4]), |
| more: m[5] == ">", |
| } |
| c := suffix[len(suffix)-1] |
| rules[c] = append(rules[c], r) |
| } |
| return rules |
| } |
| |
| var stemRules = parseStemRules() |
| |
// firstVowel returns offset plus the index of the first vowel in p, or -1
// when p contains none. The letters a, e, i, o and u are always vowels; y is
// a vowel only when its reported position (offset plus index) is greater
// than zero.
func firstVowel(offset int, p []byte) int {
	for k := 0; k < len(p); k++ {
		pos := offset + k
		c := p[k]
		if c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' {
			return pos
		}
		if c == 'y' && pos > 0 {
			return pos
		}
	}
	return -1
}
| |
| func acceptableStem(a, b []byte) bool { |
| i := firstVowel(0, a) |
| if i < 0 { |
| i = firstVowel(len(a), b) |
| } |
| l := len(a) + len(b) |
| if i == 0 { |
| return l > 1 |
| } |
| return i >= 0 && l > 2 |
| } |
| |
| func stem(s string) string { |
| stem := bytes.ToLower([]byte(s)) |
| intact := true |
| run := acceptableStem(stem, []byte{}) |
| for run { |
| run = false |
| for _, rule := range stemRules[stem[len(stem)-1]] { |
| if bytes.HasSuffix(stem, rule.suffix) && |
| (intact || !rule.intact) && |
| acceptableStem(stem[:len(stem)-rule.remove], rule.append) { |
| stem = append(stem[:len(stem)-rule.remove], rule.append...) |
| intact = false |
| run = rule.more |
| break |
| } |
| } |
| } |
| return string(stem) |
| } |