internal/postgres/searchdoc.go - pkgsite - Git at Google

 // Copyright 2020 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package postgres

 import (
 	"path/filepath"
 	"strings"
 	"unicode"

 	"github.com/russross/blackfriday/v2"
 )

 const (
 	// maxSectionWords is the default cap on the number of words kept in each
 	// search-document section; SearchDocumentSections passes it as maxSecWords.
 	maxSectionWords   = 50
 	// maxReadmeFraction is the default upper bound on the share of the README
 	// words not used for section C that may go into section D;
 	// SearchDocumentSections passes it as maxReadmeFrac.
 	maxReadmeFraction = 0.5
 )

 // SearchDocumentSections builds the text for the B, C and D sections of a
 // Postgres search document from a package synopsis and a README. A "section"
 // is the portion of the tsvector carrying the weight of the same name.
 //
 //   - B holds the synopsis.
 //   - C holds the first sentence of the README.
 //   - D holds what follows the first sentence of the README.
 //
 // Every section is broken into words, and each word is run through the
 // replacement table. No section exceeds maxSectionWords words; D is further
 // restricted to a leading fraction of the README given by maxReadmeFraction.
 //
 // readmeFilename is only used to decide whether the README is markdown.
 func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {
 	b, c, d = searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)
 	return b, c, d
 }

 // searchDocumentSections is the implementation behind SearchDocumentSections,
 // with the per-section word limit (maxSecWords) and the README fraction
 // (maxReadmeFrac) supplied by the caller.
 func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {
 	if isMarkdown(readmeFilename) {
 		// Reduce markdown to its plain text before looking for sentences.
 		readme = processMarkdown(readme)
 	}

 	// Cut the README just after its first sentence. When no sentence end is
 	// found, the whole README counts as "the rest".
 	firstSentence, remainder := "", readme
 	if end := sentenceEndIndex(readme); end > 0 {
 		firstSentence, remainder = readme[:end+1], readme[end+1:]
 	}

 	synopsisWords := processWords(synopsis)
 	firstWords := processWords(firstSentence)
 	restWords := processWords(remainder)

 	bWords, _ := split(synopsisWords, maxSecWords)
 	cWords, overflow := split(firstWords, maxSecWords)

 	// Section D draws from every README word that did not fit in section C:
 	// the overflow of the first sentence followed by the rest of the README.
 	dCandidates := append(overflow, restWords...)

 	// Keep at most maxSecWords of them, and never more than the
 	// maxReadmeFrac share of the candidates.
 	limit := int(maxReadmeFrac * float64(len(dCandidates)))
 	if limit > maxSecWords {
 		limit = maxSecWords
 	}
 	dWords, _ := split(dCandidates, limit)

 	// Without a synopsis, the README's first sentence stands in for it.
 	// Section C is then left empty rather than refilled from the README.
 	if len(bWords) == 0 {
 		bWords, cWords = cWords, nil
 	}

 	join := func(words []string) string {
 		return makeValidUnicode(strings.Join(words, " "))
 	}
 	return join(bWords), join(cWords), join(dWords)
 }

 // split splits a slice of strings into two parts. The first has length <= n,
 // and the second is the rest of the slice. If n is negative, the first part
 // is nil and the second part is the entire slice. If n >= len(a), the first
 // part is a itself and the second part is nil.
 //
 // Both parts share a's backing array, but the first part's capacity is
 // clipped to its length so that appending to it cannot overwrite the second.
 func split(a []string, n int) ([]string, []string) {
 	if n < 0 {
 		// Slicing with a negative bound would panic; honor the documented
 		// contract instead.
 		return nil, a
 	}
 	if n >= len(a) {
 		return a, nil
 	}
 	return a[:n:n], a[n:]
 }

 // sentenceEndIndex reports the byte index in s at which the first sentence
 // ends, or -1 when s contains no sentence end. A sentence end is a '.', '!'
 // or '?' that is immediately followed by whitespace or by the end of s, and
 // whose preceding rune is not an uppercase letter (so that initials such as
 // "U.S." do not terminate a sentence).
 func sentenceEndIndex(s string) int {
 	// last is the most recently scanned rune; beforeLast is the one before it.
 	// Both start as the zero rune, which is neither punctuation nor uppercase.
 	var last, beforeLast rune

 	endsSentence := func(before, mark rune) bool {
 		switch mark {
 		case '.', '!', '?':
 			return !unicode.IsUpper(before)
 		}
 		return false
 	}

 	for pos, cur := range s {
 		if unicode.IsSpace(cur) && endsSentence(beforeLast, last) {
 			// The terminator is a single-byte rune directly before cur.
 			return pos - 1
 		}
 		beforeLast, last = last, cur
 	}

 	// The string itself may finish on a terminator.
 	if endsSentence(beforeLast, last) {
 		return len(s) - 1
 	}
 	return -1
 }

 // processWords lowercases s, breaks it into whitespace-separated tokens, and
 // returns the concatenation of processWord's output for each token, in order.
 // The result is nil when no token yields a word.
 func processWords(s string) []string {
 	var out []string
 	for _, tok := range strings.Fields(strings.ToLower(s)) {
 		out = append(out, processWord(tok)...)
 	}
 	return out
 }

 // summaryReplacements maps a word to the words that should appear in its
 // place. processWord consults it for whole words and for the parts of
 // hyphenated words.
 //
 // The value determines the effect:
 //   "deleteme": nil                     // drops "deleteme"
 //   "rand": []string{"random"}          // substitutes "random" for "rand"
 //   "utf-8": []string{"utf-8", "utf8"}  // keeps "utf-8" and adds "utf8"
 var summaryReplacements = map[string][]string{
 	// Database names and their alternate spellings.
 	"mongo":      {"mongo", "mongodb"},
 	"mongodb":    {"mongo", "mongodb"},
 	"postgres":   {"postgres", "postgresql"},
 	"postgresql": {"postgres", "postgresql"},
 	"redis":      {"redis", "redisdb"},
 	"redisdb":    {"redis", "redisdb"},

 	// Words ending in -er, paired with their root.
 	"logger": {"logger", "log"}, // Postgres stemmer does not handle -er
 	"parser": {"parser", "parse"},

 	// Abbreviations and alternate forms.
 	"rand":  {"random"},
 	"utf-8": {"utf-8", "utf8"},
 }

 // processWord turns a single token into zero or more search words. Leading
 // and trailing punctuation is stripped first; a token that is nothing but
 // punctuation yields nil. A word listed in summaryReplacements is exchanged
 // for its replacement list. A hyphenated word that hyphenSplit accepts is
 // kept whole and followed by the replacements of any of its parts.
 func processWord(s string) []string {
 	word := strings.TrimFunc(s, unicode.IsPunct)
 	if word == "" {
 		return nil
 	}
 	if repl, found := summaryReplacements[word]; found {
 		return repl
 	}
 	if !hyphenSplit(word) || !strings.Contains(word, "-") {
 		// Nothing to split: the word stands for itself.
 		return []string{word}
 	}

 	// Start with the full hyphenated word, then add the replacements for each
 	// part that has any. The bare parts are left out because the Postgres
 	// text-search processor adds them itself.
 	out := []string{word}
 	for _, part := range strings.Split(word, "-") {
 		out = append(out, summaryReplacements[part]...)
 	}
 	return out
 }

 // hyphenSplit reports whether s should be split on hyphens. Strings that
 // begin with an http or https URL scheme are left intact.
 func hyphenSplit(s string) bool {
 	for _, scheme := range [...]string{"http://", "https://"} {
 		if strings.HasPrefix(s, scheme) {
 			return false
 		}
 	}
 	return true
 }

 // isMarkdown reports whether filename's extension marks it as a markdown
 // file. The comparison ignores case and accepts both extensions mentioned in
 // https://tools.ietf.org/html/rfc7763: ".md" and ".markdown".
 func isMarkdown(filename string) bool {
 	switch strings.ToLower(filepath.Ext(filename)) {
 	case ".md", ".markdown":
 		return true
 	default:
 		return false
 	}
 }

 // processMarkdown parses s as markdown with blackfriday's common extensions
 // enabled and returns the text gathered from the parse tree by walkMarkdown,
 // which leaves out images and code blocks.
 func processMarkdown(s string) string {
 	md := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))
 	text := walkMarkdown(md.Parse([]byte(s)), nil, 0)
 	return string(text)
 }

 // walkMarkdown appends the text found in the blackfriday parse tree rooted at
 // n to buf and returns the extended buffer. A nil n leaves buf unchanged.
 // level is the depth of n below the starting node; it is incremented for each
 // recursive call but does not otherwise affect the result.
 func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {
 	if n == nil {
 		return buf
 	}

 	switch n.Type {
 	case blackfriday.Image, blackfriday.CodeBlock:
 		// Drop these subtrees entirely: images are usually irrelevant to the
 		// package (badges and such), and code blocks have a wide variety of
 		// unrelated symbols.
 		return buf
 	case blackfriday.Paragraph, blackfriday.Heading:
 		// Separate this block's text from whatever was collected before it.
 		if len(buf) > 0 {
 			buf = append(buf, ' ')
 		}
 	default:
 		buf = append(buf, n.Literal...)
 	}

 	child := n.FirstChild
 	for child != nil {
 		buf = walkMarkdown(child, buf, level+1)
 		child = child.Next
 	}
 	return buf
 }
	// Copyright 2020 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package postgres

	import (
	"path/filepath"
	"strings"
	"unicode"

	"github.com/russross/blackfriday/v2"
	)

	const (
	// maxSectionWords is the default cap on the number of words kept in each
	// search-document section; SearchDocumentSections passes it as maxSecWords.
	maxSectionWords = 50
	// maxReadmeFraction is the default upper bound on the share of the README
	// words not used for section C that may go into section D;
	// SearchDocumentSections passes it as maxReadmeFrac.
	maxReadmeFraction = 0.5
	)

	// SearchDocumentSections builds the text for the B, C and D sections of a
	// Postgres search document from a package synopsis and a README. A "section"
	// is the portion of the tsvector carrying the weight of the same name.
	//
	//   - B holds the synopsis.
	//   - C holds the first sentence of the README.
	//   - D holds what follows the first sentence of the README.
	//
	// Every section is broken into words, and each word is run through the
	// replacement table. No section exceeds maxSectionWords words; D is further
	// restricted to a leading fraction of the README given by maxReadmeFraction.
	//
	// readmeFilename is only used to decide whether the README is markdown.
	func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {
		b, c, d = searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)
		return b, c, d
	}

	// searchDocumentSections is the implementation behind SearchDocumentSections,
	// with the per-section word limit (maxSecWords) and the README fraction
	// (maxReadmeFrac) supplied by the caller.
	func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {
		if isMarkdown(readmeFilename) {
			// Reduce markdown to its plain text before looking for sentences.
			readme = processMarkdown(readme)
		}

		// Cut the README just after its first sentence. When no sentence end is
		// found, the whole README counts as "the rest".
		firstSentence, remainder := "", readme
		if end := sentenceEndIndex(readme); end > 0 {
			firstSentence, remainder = readme[:end+1], readme[end+1:]
		}

		synopsisWords := processWords(synopsis)
		firstWords := processWords(firstSentence)
		restWords := processWords(remainder)

		bWords, _ := split(synopsisWords, maxSecWords)
		cWords, overflow := split(firstWords, maxSecWords)

		// Section D draws from every README word that did not fit in section C:
		// the overflow of the first sentence followed by the rest of the README.
		dCandidates := append(overflow, restWords...)

		// Keep at most maxSecWords of them, and never more than the
		// maxReadmeFrac share of the candidates.
		limit := int(maxReadmeFrac * float64(len(dCandidates)))
		if limit > maxSecWords {
			limit = maxSecWords
		}
		dWords, _ := split(dCandidates, limit)

		// Without a synopsis, the README's first sentence stands in for it.
		// Section C is then left empty rather than refilled from the README.
		if len(bWords) == 0 {
			bWords, cWords = cWords, nil
		}

		join := func(words []string) string {
			return makeValidUnicode(strings.Join(words, " "))
		}
		return join(bWords), join(cWords), join(dWords)
	}

	// split splits a slice of strings into two parts. The first has length <= n,
	// and the second is the rest of the slice. If n is negative, the first part
	// is nil and the second part is the entire slice. If n >= len(a), the first
	// part is a itself and the second part is nil.
	//
	// Both parts share a's backing array, but the first part's capacity is
	// clipped to its length so that appending to it cannot overwrite the second.
	func split(a []string, n int) ([]string, []string) {
		if n < 0 {
			// Slicing with a negative bound would panic; honor the documented
			// contract instead.
			return nil, a
		}
		if n >= len(a) {
			return a, nil
		}
		return a[:n:n], a[n:]
	}

	// sentenceEndIndex reports the byte index in s at which the first sentence
	// ends, or -1 when s contains no sentence end. A sentence end is a '.', '!'
	// or '?' that is immediately followed by whitespace or by the end of s, and
	// whose preceding rune is not an uppercase letter (so that initials such as
	// "U.S." do not terminate a sentence).
	func sentenceEndIndex(s string) int {
		// last is the most recently scanned rune; beforeLast is the one before it.
		// Both start as the zero rune, which is neither punctuation nor uppercase.
		var last, beforeLast rune

		// The terminator test is written as a switch; the original expression
		// used backslash-escaped "or" operators, which are not valid Go.
		endsSentence := func(before, mark rune) bool {
			switch mark {
			case '.', '!', '?':
				return !unicode.IsUpper(before)
			}
			return false
		}

		for pos, cur := range s {
			if unicode.IsSpace(cur) && endsSentence(beforeLast, last) {
				// The terminator is a single-byte rune directly before cur.
				return pos - 1
			}
			beforeLast, last = last, cur
		}

		// The string itself may finish on a terminator.
		if endsSentence(beforeLast, last) {
			return len(s) - 1
		}
		return -1
	}

	// processWords lowercases s, breaks it into whitespace-separated tokens, and
	// returns the concatenation of processWord's output for each token, in order.
	// The result is nil when no token yields a word.
	func processWords(s string) []string {
		var out []string
		for _, tok := range strings.Fields(strings.ToLower(s)) {
			out = append(out, processWord(tok)...)
		}
		return out
	}

	// summaryReplacements maps a word to the words that should appear in its
	// place. processWord consults it for whole words and for the parts of
	// hyphenated words.
	//
	// The value determines the effect:
	//   "deleteme": nil                     // drops "deleteme"
	//   "rand": []string{"random"}          // substitutes "random" for "rand"
	//   "utf-8": []string{"utf-8", "utf8"}  // keeps "utf-8" and adds "utf8"
	var summaryReplacements = map[string][]string{
		// Database names and their alternate spellings.
		"mongo":      {"mongo", "mongodb"},
		"mongodb":    {"mongo", "mongodb"},
		"postgres":   {"postgres", "postgresql"},
		"postgresql": {"postgres", "postgresql"},
		"redis":      {"redis", "redisdb"},
		"redisdb":    {"redis", "redisdb"},

		// Words ending in -er, paired with their root.
		"logger": {"logger", "log"}, // Postgres stemmer does not handle -er
		"parser": {"parser", "parse"},

		// Abbreviations and alternate forms.
		"rand":  {"random"},
		"utf-8": {"utf-8", "utf8"},
	}

	// processWord turns a single token into zero or more search words. Leading
	// and trailing punctuation is stripped first; a token that is nothing but
	// punctuation yields nil. A word listed in summaryReplacements is exchanged
	// for its replacement list. A hyphenated word that hyphenSplit accepts is
	// kept whole and followed by the replacements of any of its parts.
	func processWord(s string) []string {
		word := strings.TrimFunc(s, unicode.IsPunct)
		if word == "" {
			return nil
		}
		if repl, found := summaryReplacements[word]; found {
			return repl
		}
		if !hyphenSplit(word) || !strings.Contains(word, "-") {
			// Nothing to split: the word stands for itself.
			return []string{word}
		}

		// Start with the full hyphenated word, then add the replacements for each
		// part that has any. The bare parts are left out because the Postgres
		// text-search processor adds them itself.
		out := []string{word}
		for _, part := range strings.Split(word, "-") {
			out = append(out, summaryReplacements[part]...)
		}
		return out
	}

	// hyphenSplit reports whether s should be split on hyphens. Strings that
	// begin with an http or https URL scheme are left intact.
	//
	// The prefixes are checked in a loop; the original expression joined the two
	// checks with a backslash-escaped "or" operator, which is not valid Go.
	func hyphenSplit(s string) bool {
		for _, scheme := range [...]string{"http://", "https://"} {
			if strings.HasPrefix(s, scheme) {
				return false
			}
		}
		return true
	}

	// isMarkdown reports whether filename's extension marks it as a markdown
	// file. The comparison ignores case and accepts both extensions mentioned in
	// https://tools.ietf.org/html/rfc7763: ".md" and ".markdown".
	//
	// The extensions are matched with a switch; the original expression joined
	// the comparisons with a backslash-escaped "or" operator, which is not valid
	// Go.
	func isMarkdown(filename string) bool {
		switch strings.ToLower(filepath.Ext(filename)) {
		case ".md", ".markdown":
			return true
		default:
			return false
		}
	}

	// processMarkdown parses s as markdown with blackfriday's common extensions
	// enabled and returns the text gathered from the parse tree by walkMarkdown,
	// which leaves out images and code blocks.
	func processMarkdown(s string) string {
		md := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))
		text := walkMarkdown(md.Parse([]byte(s)), nil, 0)
		return string(text)
	}

	// walkMarkdown appends the text found in the blackfriday parse tree rooted at
	// n to buf and returns the extended buffer. A nil n leaves buf unchanged.
	// level is the depth of n below the starting node; it is incremented for each
	// recursive call but does not otherwise affect the result.
	func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {
		if n == nil {
			return buf
		}

		switch n.Type {
		case blackfriday.Image, blackfriday.CodeBlock:
			// Drop these subtrees entirely: images are usually irrelevant to the
			// package (badges and such), and code blocks have a wide variety of
			// unrelated symbols.
			return buf
		case blackfriday.Paragraph, blackfriday.Heading:
			// Separate this block's text from whatever was collected before it.
			if len(buf) > 0 {
				buf = append(buf, ' ')
			}
		default:
			buf = append(buf, n.Literal...)
		}

		child := n.FirstChild
		for child != nil {
			buf = walkMarkdown(child, buf, level+1)
			child = child.Next
		}
		return buf
	}