blob: b78fffa0f8f1759616ff2596d639a83fcca5f044 [file] [log] [blame]
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package postgres
import (
"path/filepath"
"strings"
"unicode"
"github.com/russross/blackfriday/v2"
)
const (
// maxSectionWords is the word limit that SearchDocumentSections applies to
// each section of the search document.
maxSectionWords = 50
// maxReadmeFraction is the limit that SearchDocumentSections places on the
// D section, expressed as a fraction of the README words that are
// candidates for that section.
maxReadmeFraction = 0.5
)
// SearchDocumentSections computes the B, C and D sections of a Postgres search
// document from a package synopsis and a README.
// By "X section" we mean the portion of the tsvector with weight "X".
//
// The B section consists of the synopsis.
// The C section consists of the first sentence of the README.
// The D section consists of the remainder of the README.
// If the synopsis contributes no words, the first sentence of the README is
// used as the B section instead and the C section is empty.
//
// readmeFilename is used only to decide whether the README is markdown; if it
// is, the README is reduced to its text before being split into sections.
//
// All sections are split into words and processed for replacements.
// Each section is limited to maxSectionWords words, and in addition the
// D section is limited to an initial fraction of the README, determined
// by maxReadmeFraction.
func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {
return searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)
}
// searchDocumentSections implements SearchDocumentSections with the per-section
// word limit (maxSecWords) and the README fraction limit (maxReadmeFrac)
// supplied by the caller instead of taken from the package constants.
func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {
	if isMarkdown(readmeFilename) {
		readme = processMarkdown(readme)
	}

	// Cut the README just after its first sentence. When no usable sentence
	// end is found, the whole README counts as "the rest".
	firstSentence, remainder := "", readme
	if end := sentenceEndIndex(readme); end > 0 {
		firstSentence, remainder = readme[:end+1], readme[end+1:]
	}

	synopsisWords := processWords(synopsis)
	firstWords := processWords(firstSentence)
	restWords := processWords(remainder)

	wordsB, _ := split(synopsisWords, maxSecWords)
	wordsC, overflow := split(firstWords, maxSecWords)

	// Candidates for section D: whatever part of the first sentence did not
	// fit into section C, followed by the remainder of the README.
	candidatesD := append(overflow, restWords...)

	// Section D keeps at most maxSecWords words, and never more than the
	// maxReadmeFrac fraction of its candidates.
	limit := int(maxReadmeFrac * float64(len(candidatesD)))
	if limit > maxSecWords {
		limit = maxSecWords
	}
	wordsD, _ := split(candidatesD, limit)

	// Without a synopsis, the README's first sentence stands in for it.
	// Section C is then left empty; the rest of the README is not promoted.
	if len(wordsB) == 0 {
		wordsB, wordsC = wordsC, nil
	}

	join := func(words []string) string {
		return makeValidUnicode(strings.Join(words, " "))
	}
	return join(wordsB), join(wordsC), join(wordsD)
}
// split splits a into two parts. The first part holds at most n elements and
// the second part is the rest of the slice. If n is negative, the first part
// is nil and the second part is the entire slice. If n is at least len(a),
// the first part is the entire slice and the second part is nil.
//
// Both parts are sub-slices of a; no elements are copied.
func split(a []string, n int) ([]string, []string) {
	switch {
	case n < 0:
		// A negative bound in a slice expression panics at run time, so the
		// documented "nothing in the first part" result is produced here.
		return nil, a
	case n >= len(a):
		return a, nil
	default:
		return a[:n], a[n:]
	}
}
// sentenceEndIndex locates the end of the first sentence in s and returns the
// byte index of its terminating punctuation, or -1 if s has no sentence end.
//
// A sentence is terminated by '.', '!' or '?' when that character is followed
// by whitespace or is the last character of s, and the character before it is
// not an uppercase letter.
func sentenceEndIndex(s string) int {
	// terminates reports whether punct, coming right after before, closes a
	// sentence.
	terminates := func(before, punct rune) bool {
		switch punct {
		case '.', '!', '?':
			return !unicode.IsUpper(before)
		}
		return false
	}

	var before, last rune // the two runes preceding the current position
	for pos, cur := range s {
		if unicode.IsSpace(cur) && terminates(before, last) {
			// The terminators are all one byte wide, so the punctuation
			// sits exactly one byte before the whitespace.
			return pos - 1
		}
		before, last = last, cur
	}
	if terminates(before, last) {
		return len(s) - 1
	}
	return -1
}
// processWords lowercases s, breaks it into whitespace-separated fields and
// runs each field through processWord, concatenating the results in order.
// It returns nil when no words are produced.
func processWords(s string) []string {
	var out []string
	for _, field := range strings.Fields(strings.ToLower(s)) {
		out = append(out, processWord(field)...)
	}
	return out
}
// summaryReplacements maps a word to the words that take its place in a search
// document. processWord consults it both for whole words and for the parts of
// hyphenated words.
//
// The value determines what happens to the key. Example key-value pairs:
//
//	"deleteMe": nil                       // removes "deleteMe"
//	"rand":     []string{"random"}        // replaces "rand" with "random"
//	"utf-8":    []string{"utf-8", "utf8"} // adds "utf8" whenever "utf-8" is seen
var summaryReplacements = map[string][]string{
	// Database names and their aliases: either spelling yields both.
	"mongo":      {"mongo", "mongodb"},
	"mongodb":    {"mongo", "mongodb"},
	"postgres":   {"postgres", "postgresql"},
	"postgresql": {"postgres", "postgresql"},
	"redis":      {"redis", "redisdb"},
	"redisdb":    {"redis", "redisdb"},

	// Words ending in -er, which the Postgres stemmer does not handle:
	// add the stem alongside the original.
	"logger": {"logger", "log"},
	"parser": {"parser", "parse"},

	// Abbreviations and spelling variants.
	"rand":  {"random"},
	"utf-8": {"utf-8", "utf8"},
}
// processWord turns a single field s into zero or more search words.
//
// Leading and trailing punctuation is stripped first; a field that is nothing
// but punctuation yields no words. A word listed in summaryReplacements is
// replaced by its entry (the map's own slice is returned, not a copy). Any
// other word is kept as is, and if it is hyphenated, the replacements for
// each of its parts are added after it. Words starting with an http:// or
// https:// prefix are never examined for hyphens.
func processWord(s string) []string {
	word := strings.TrimFunc(s, unicode.IsPunct)
	if word == "" {
		return nil
	}
	if repl, found := summaryReplacements[word]; found {
		return repl
	}
	if !hyphenSplit(word) || !strings.Contains(word, "-") {
		return []string{word}
	}

	// Start with the full hyphenated word, then add the replacements for any
	// of its parts. The parts themselves are not added: the Postgres
	// text-search processor will do that.
	out := []string{word}
	for _, part := range strings.Split(word, "-") {
		// A part with no entry yields a nil slice, which appends nothing.
		out = append(out, summaryReplacements[part]...)
	}
	return out
}
// hyphenSplit reports whether s should be split on hyphens. It answers false
// only for strings that begin with "http://" or "https://".
func hyphenSplit(s string) bool {
	for _, scheme := range [...]string{"http://", "https://"} {
		if strings.HasPrefix(s, scheme) {
			return false
		}
	}
	return true
}
// isMarkdown reports whether filename has a markdown extension. The comparison
// ignores case.
func isMarkdown(filename string) bool {
	switch strings.ToLower(filepath.Ext(filename)) {
	case ".md", ".markdown":
		// https://tools.ietf.org/html/rfc7763 mentions both extensions.
		return true
	default:
		return false
	}
}
// processMarkdown parses s as markdown with blackfriday's common extensions and
// returns the text that walkMarkdown extracts from the parse tree, which
// leaves out images and code blocks.
func processMarkdown(s string) string {
	md := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))
	tree := md.Parse([]byte(s))
	return string(walkMarkdown(tree, nil, 0))
}
// walkMarkdown appends the text found in the blackfriday parse tree rooted at n
// to buf and returns the extended buffer. A nil node leaves buf unchanged.
//
// level is the depth of n below the node the walk started from; it is passed
// along on recursion but does not otherwise affect the result.
func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {
	if n == nil {
		return buf
	}

	kind := n.Type
	if kind == blackfriday.Image || kind == blackfriday.CodeBlock {
		// Images are usually irrelevant to the package (badges and such),
		// and code blocks have a wide variety of unrelated symbols, so
		// both are dropped together with everything beneath them.
		return buf
	}

	if kind == blackfriday.Paragraph || kind == blackfriday.Heading {
		// Put a space between this block and any text gathered so far.
		if len(buf) > 0 {
			buf = append(buf, ' ')
		}
	} else {
		buf = append(buf, n.Literal...)
	}

	for child := n.FirstChild; child != nil; child = child.Next {
		buf = walkMarkdown(child, buf, level+1)
	}
	return buf
}