database/stem.go - gddo - Git at Google

 // Copyright 2013 The Go Authors. All rights reserved.
 //
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file or at
 // https://developers.google.com/open-source/licenses/bsd.

 // This file implements the Paice/Husk stemming algorithm.
 // http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm

 package database

 import (
 	"bytes"
 	"regexp"
 	"strconv"
 )

 // stemRuleText is the stemming rule table consumed by parseStemRules.
 //
 // Rules are separated by spaces or line breaks; each line groups the rules
 // that start with the same letter. A rule is written as:
 //
 //	<suffix spelled backwards>[*]<digit>[<letters>]<. or >>
 //
 // where "*" marks a rule that only applies while the word is still unmodified,
 // the digit is the number of trailing bytes to remove, the optional letters
 // are appended after the removal, and the terminator says whether stemming
 // stops (".") or continues with another pass (">").
 const stemRuleText = `
 ai*2. a*1.
 bb1.
 city3s. ci2> cn1t>
 dd1. dei3y> deec2ss. dee1. de2> dooh4>
 e1>
 feil1v. fi2>
 gni3> gai3y. ga2> gg1.
 ht*2. hsiug5ct. hsi3>
 i*1. i1y>
 ji1d. juf1s. ju1d. jo1d. jeh1r. jrev1t. jsim2t. jn1d. j1s.
 lbaifi6. lbai4y. lba3> lbi3. lib2l> lc1. lufi4y. luf3> lu2. lai3> lau3> la2> ll1.
 mui3. mu*2. msi3> mm1.
 nois4j> noix4ct. noi3> nai3> na2> nee0. ne2> nn1.
 pihs4> pp1.
 re2> rae0. ra2. ro2> ru2> rr1. rt1> rei3y>
 sei3y> sis2. si2> ssen4> ss0. suo3> su*2. s*1> s0.
 tacilp4y. ta2> tnem4> tne3> tna3> tpir2b. tpro2b. tcud1. tpmus2. tpec2iv. tulo2v. tsis0. tsi3> tt1.
 uqi3. ugo1.
 vis3j> vie0. vi2>
 ylb1> yli3y> ylp0. yl2> ygo1. yhp1. ymo1. ypo1. yti3> yte3> ytl2. yrtsi5. yra3> yro3> yfi3. ycn2t> yca3>
 zi2> zy1s.
 `

 // stemRule is one parsed entry of the stemming rule table.
 type stemRule struct {
 	// text is the raw rule token exactly as the parser's regexp matched it.
 	text   string
 	// suffix is the word ending the rule applies to, in normal reading order.
 	suffix []byte
 	// intact restricts the rule to words that no earlier rule has modified.
 	intact bool
 	// remove is the number of trailing bytes to cut from the word.
 	remove int
 	// append holds the bytes added after the cut (may be empty).
 	append []byte
 	// more reports whether stemming continues after this rule is applied.
 	more   bool
 }

 // parseStemRules parses stemRuleText into a lookup table.
 //
 // The returned map is keyed by the last byte of each rule's suffix (the final
 // letter of a word the rule can apply to). Rules that share a key keep the
 // order in which they appear in stemRuleText.
 func parseStemRules() map[byte][]*stemRule {
 	// One rule token, preceded by a line start, space or tab:
 	//   1: suffix spelled backwards; at least one letter is required because
 	//      the table is keyed on the suffix's final byte
 	//   2: optional "*" intact marker
 	//   3: single digit, number of bytes to remove
 	//   4: optional letters to append
 	//   5: "." (stop) or ">" (continue)
 	ruleRE := regexp.MustCompile(`(?m)(?:^|[ \t])([a-zA-Z]+)(\*?)([0-9])([a-zA-Z]*)([.>])`)

 	rules := make(map[byte][]*stemRule)
 	for _, m := range ruleRE.FindAllStringSubmatch(stemRuleText, -1) {
 		// The table stores suffixes reversed; flip them back in place.
 		suffix := []byte(m[1])
 		for i, j := 0, len(suffix)-1; i < j; i, j = i+1, j-1 {
 			suffix[i], suffix[j] = suffix[j], suffix[i]
 		}

 		// m[3] is exactly one ASCII digit by construction of the regexp, so
 		// Atoi cannot fail here.
 		remove, _ := strconv.Atoi(m[3])
 		r := &stemRule{
 			text:   m[0],
 			suffix: suffix,
 			intact: m[2] == "*",
 			remove: remove,
 			append: []byte(m[4]),
 			more:   m[5] == ">",
 		}
 		c := suffix[len(suffix)-1]
 		rules[c] = append(rules[c], r)
 	}
 	return rules
 }

 // stemRules is the parsed rule table, built once during package
 // initialization and indexed by the last byte of each rule's suffix.
 var stemRules = parseStemRules()

 // firstVowel returns offset plus the index of the first vowel in p, or -1 if
 // p contains none. The letters a, e, i, o and u always count as vowels; y
 // counts only when its reported position (offset included) is greater than
 // zero.
 func firstVowel(offset int, p []byte) int {
 	for idx := range p {
 		pos := offset + idx
 		c := p[idx]
 		if c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' {
 			return pos
 		}
 		if c == 'y' && pos > 0 {
 			return pos
 		}
 	}
 	return -1
 }

 // acceptableStem reports whether the concatenation of a and b is a valid
 // stem. The combined bytes must contain a vowel (as defined by firstVowel);
 // when that vowel is the very first byte the stem needs at least two bytes,
 // otherwise it needs at least three.
 func acceptableStem(a, b []byte) bool {
 	total := len(a) + len(b)

 	// Look in a first; only fall through to b when a has no vowel.
 	pos := firstVowel(0, a)
 	if pos < 0 {
 		pos = firstVowel(len(a), b)
 	}

 	switch {
 	case pos < 0:
 		return false
 	case pos == 0:
 		return total > 1
 	default:
 		return total > 2
 	}
 }

 // stem lower-cases s and repeatedly applies rules from stemRules until a rule
 // marked as final is applied or no rule matches.
 //
 // Each pass looks up the rules for the word's current last byte and applies
 // the first one whose suffix matches, whose intact requirement is satisfied,
 // and whose result passes acceptableStem. Input that is not an acceptable
 // stem to begin with is returned lower-cased but otherwise unchanged.
 func stem(s string) string {
 	word := bytes.ToLower([]byte(s))
 	untouched := true // no rule has modified word yet

 	for again := acceptableStem(word, nil); again; {
 		again = false
 		for _, r := range stemRules[word[len(word)-1]] {
 			if !bytes.HasSuffix(word, r.suffix) {
 				continue
 			}
 			if r.intact && !untouched {
 				continue
 			}
 			base := word[:len(word)-r.remove]
 			if !acceptableStem(base, r.append) {
 				continue
 			}
 			word = append(base, r.append...)
 			untouched = false
 			again = r.more
 			break
 		}
 	}
 	return string(word)
 }
	// Copyright 2013 The Go Authors. All rights reserved.
	//
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file or at
	// https://developers.google.com/open-source/licenses/bsd.

	// This file implements the Paice/Husk stemming algorithm.
	// http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm

	package database

	import (
	"bytes"
	"regexp"
	"strconv"
	)

	// stemRuleText is the stemming rule table consumed by parseStemRules.
	//
	// Rules are separated by spaces or line breaks; each line groups the rules
	// that start with the same letter. A rule is written as:
	//
	//	<suffix spelled backwards>[*]<digit>[<letters>]<. or >>
	//
	// where "*" marks a rule that only applies while the word is still
	// unmodified, the digit is the number of trailing bytes to remove, the
	// optional letters are appended after the removal, and the terminator says
	// whether stemming stops (".") or continues with another pass (">").
	//
	// The rule lines start in column 0 on purpose: the text is a raw string, so
	// any indentation would become part of the data being parsed.
	const stemRuleText = `
ai*2. a*1.
bb1.
city3s. ci2> cn1t>
dd1. dei3y> deec2ss. dee1. de2> dooh4>
e1>
feil1v. fi2>
gni3> gai3y. ga2> gg1.
ht*2. hsiug5ct. hsi3>
i*1. i1y>
ji1d. juf1s. ju1d. jo1d. jeh1r. jrev1t. jsim2t. jn1d. j1s.
lbaifi6. lbai4y. lba3> lbi3. lib2l> lc1. lufi4y. luf3> lu2. lai3> lau3> la2> ll1.
mui3. mu*2. msi3> mm1.
nois4j> noix4ct. noi3> nai3> na2> nee0. ne2> nn1.
pihs4> pp1.
re2> rae0. ra2. ro2> ru2> rr1. rt1> rei3y>
sei3y> sis2. si2> ssen4> ss0. suo3> su*2. s*1> s0.
tacilp4y. ta2> tnem4> tne3> tna3> tpir2b. tpro2b. tcud1. tpmus2. tpec2iv. tulo2v. tsis0. tsi3> tt1.
uqi3. ugo1.
vis3j> vie0. vi2>
ylb1> yli3y> ylp0. yl2> ygo1. yhp1. ymo1. ypo1. yti3> yte3> ytl2. yrtsi5. yra3> yro3> yfi3. ycn2t> yca3>
zi2> zy1s.
`

	// stemRule is one parsed entry of the stemming rule table.
	type stemRule struct {
		// text is the raw rule token exactly as the parser's regexp matched it.
		text string
		// suffix is the word ending the rule applies to, in normal reading order.
		suffix []byte
		// intact restricts the rule to words that no earlier rule has modified.
		intact bool
		// remove is the number of trailing bytes to cut from the word.
		remove int
		// append holds the bytes added after the cut (may be empty).
		append []byte
		// more reports whether stemming continues after this rule is applied.
		more bool
	}

	// parseStemRules parses stemRuleText into a lookup table.
	//
	// The returned map is keyed by the last byte of each rule's suffix (the
	// final letter of a word the rule can apply to). Rules that share a key keep
	// the order in which they appear in stemRuleText.
	func parseStemRules() map[byte][]*stemRule {
		// One rule token, preceded by a line start, space or tab:
		//   1: suffix spelled backwards; at least one letter is required
		//      because the table is keyed on the suffix's final byte
		//   2: optional "*" intact marker
		//   3: single digit, number of bytes to remove
		//   4: optional letters to append
		//   5: "." (stop) or ">" (continue)
		ruleRE := regexp.MustCompile(`(?m)(?:^|[ \t])([a-zA-Z]+)(\*?)([0-9])([a-zA-Z]*)([.>])`)

		rules := make(map[byte][]*stemRule)
		for _, m := range ruleRE.FindAllStringSubmatch(stemRuleText, -1) {
			// The table stores suffixes reversed; flip them back in place.
			suffix := []byte(m[1])
			for i, j := 0, len(suffix)-1; i < j; i, j = i+1, j-1 {
				suffix[i], suffix[j] = suffix[j], suffix[i]
			}

			// m[3] is exactly one ASCII digit by construction of the regexp,
			// so Atoi cannot fail here.
			remove, _ := strconv.Atoi(m[3])
			r := &stemRule{
				text:   m[0],
				suffix: suffix,
				intact: m[2] == "*",
				remove: remove,
				append: []byte(m[4]),
				more:   m[5] == ">",
			}
			c := suffix[len(suffix)-1]
			rules[c] = append(rules[c], r)
		}
		return rules
	}

	// stemRules is the parsed rule table, built once during package
	// initialization and indexed by the last byte of each rule's suffix.
	var stemRules = parseStemRules()

	// firstVowel returns offset plus the index of the first vowel in p, or -1
	// if p contains none. The letters a, e, i, o and u always count as vowels;
	// y counts only when its reported position (offset included) is greater
	// than zero.
	func firstVowel(offset int, p []byte) int {
		for idx := range p {
			pos := offset + idx
			c := p[idx]
			if c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' {
				return pos
			}
			if c == 'y' && pos > 0 {
				return pos
			}
		}
		return -1
	}

	// acceptableStem reports whether the concatenation of a and b is a valid
	// stem. The combined bytes must contain a vowel (as defined by firstVowel);
	// when that vowel is the very first byte the stem needs at least two bytes,
	// otherwise it needs at least three.
	func acceptableStem(a, b []byte) bool {
		total := len(a) + len(b)

		// Look in a first; only fall through to b when a has no vowel.
		pos := firstVowel(0, a)
		if pos < 0 {
			pos = firstVowel(len(a), b)
		}

		switch {
		case pos < 0:
			return false
		case pos == 0:
			return total > 1
		default:
			return total > 2
		}
	}

	func stem(s string) string {
	stem := bytes.ToLower([]byte(s))
	intact := true
	run := acceptableStem(stem, []byte{})
	for run {
	run = false
	for _, rule := range stemRules[stem[len(stem)-1]] {
	if bytes.HasSuffix(stem, rule.suffix) &&
	(intact \|\| !rule.intact) &&
	acceptableStem(stem[:len(stem)-rule.remove], rule.append) {
	stem = append(stem[:len(stem)-rule.remove], rule.append...)
	intact = false
	run = rule.more
	break
	}
	}
	}
	return string(stem)
	}