// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package postgres

import (
	"path/filepath"
	"strings"
	"unicode"

	"github.com/russross/blackfriday/v2"
)

// Default limits passed by SearchDocumentSections to searchDocumentSections.
//
// maxSectionWords caps the number of words kept in each section.
// maxReadmeFraction caps section D at that fraction of the README words
// that remain after section C has been taken.
const (
	maxSectionWords   = 50
	maxReadmeFraction = 0.5
)

// SearchDocumentSections computes the B, C and D sections of a Postgres search
// document from a package synopsis and a README. A "B section" is the portion
// of the tsvector that carries weight "B", and likewise for "C" and "D".
//
//   - B holds the synopsis.
//   - C holds the first sentence of the README.
//   - D holds the remainder of the README.
//
// Every section is split into words, and each word is run through the
// replacement processing. No section holds more than maxSectionWords words;
// section D is additionally capped at maxReadmeFraction of the README words
// that are left over after section C.
//
// If readmeFilename names a markdown file, the README is reduced to its text
// before being split.
func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {
	b, c, d = searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)
	return b, c, d
}

// searchDocumentSections is the implementation behind SearchDocumentSections,
// with the section word limit (maxSecWords) and the README fraction limit
// (maxReadmeFrac) supplied by the caller.
//
// It returns the text of sections B, C and D, each as a space-separated list
// of processed words that has been passed through makeValidUnicode.
func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {
	if isMarkdown(readmeFilename) {
		readme = processMarkdown(readme)
	}

	// Peel the first sentence off the README. When no sentence end is found
	// (or it is at index 0), the whole README counts as "the rest".
	firstSentence, remainder := "", readme
	if end := sentenceEndIndex(readme); end > 0 {
		firstSentence, remainder = readme[:end+1], readme[end+1:]
	}

	synopsisWords := processWords(synopsis)
	firstWords := processWords(firstSentence)
	restWords := processWords(remainder)

	sectionB, _ := split(synopsisWords, maxSecWords)
	sectionC, overflow := split(firstWords, maxSecWords)

	// Section D draws from every README word that did not fit in section C:
	// the overflow of the first sentence followed by the rest of the README.
	candidates := append(overflow, restWords...)

	// Keep at most maxSecWords of those words, and never more than the
	// maxReadmeFrac share of them.
	limit := int(maxReadmeFrac * float64(len(candidates)))
	if limit > maxSecWords {
		limit = maxSecWords
	}
	sectionD, _ := split(candidates, limit)

	// Without a synopsis, the README's first sentence is promoted to section
	// B. Section C is then left empty; the rest of the README is not promoted
	// into it.
	if len(sectionB) == 0 {
		sectionB, sectionC = sectionC, nil
	}

	join := func(words []string) string {
		return makeValidUnicode(strings.Join(words, " "))
	}
	return join(sectionB), join(sectionC), join(sectionD)
}

// split splits a slice of strings into two parts. The first has length <= n,
// and the second is the rest of the slice.
//
// If n is negative, the first part is nil and the second part is the entire
// slice. If n is at least len(a), the first part is the entire slice and the
// second part is nil.
//
// Both results share a's backing array; no elements are copied.
func split(a []string, n int) ([]string, []string) {
	if n < 0 {
		// Slicing with a negative bound would panic, so handle it explicitly.
		return nil, a
	}
	if n >= len(a) {
		return a, nil
	}
	return a[:n], a[n:]
}

// sentenceEndIndex returns the byte index in s of the punctuation mark that
// ends the first sentence, or -1 if there is none.
//
// A sentence ends at a '.', '!' or '?' that is immediately followed by
// whitespace or by the end of the string, provided the character just before
// the punctuation mark is not an uppercase letter.
func sentenceEndIndex(s string) int {
	// last is the most recently consumed rune; beforeLast is the one before it.
	var last, beforeLast rune

	// atSentenceEnd reports whether the runes consumed so far finish a
	// sentence, assuming whatever follows is whitespace or the end of s.
	atSentenceEnd := func() bool {
		switch last {
		case '.', '!', '?':
			return !unicode.IsUpper(beforeLast)
		}
		return false
	}

	for pos, r := range s {
		if unicode.IsSpace(r) && atSentenceEnd() {
			// The terminator is a single-byte rune right before pos.
			return pos - 1
		}
		beforeLast, last = last, r
	}

	if !atSentenceEnd() {
		return -1
	}
	return len(s) - 1
}

// processWords lowercases s, breaks it into whitespace-separated fields, and
// returns the concatenation of processWord applied to each field in order.
// The result is nil when no field yields a word.
func processWords(s string) []string {
	var out []string
	for _, field := range strings.Fields(strings.ToLower(s)) {
		out = append(out, processWord(field)...)
	}
	return out
}

// summaryReplacements maps a word to the words that should take its place.
// It is consulted by processWord. Words coming through processWords have
// already been lowercased by the time they are looked up here.
//
// The value decides what happens to a matching word:
//
//	"deleteme": nil                -- drop the word
//	"rand":     {"random"}         -- substitute "random" for "rand"
//	"utf-8":    {"utf-8", "utf8"}  -- keep "utf-8" and also add "utf8"
var summaryReplacements = map[string][]string{
	"postgres":   {"postgres", "postgresql"},
	"postgresql": {"postgres", "postgresql"},
	"rand":       {"random"},
	"mongo":      {"mongo", "mongodb"},
	"mongodb":    {"mongo", "mongodb"},
	"redis":      {"redis", "redisdb"},
	"redisdb":    {"redis", "redisdb"},
	"logger":     {"logger", "log"}, // Postgres stemmer does not handle -er
	"parser":     {"parser", "parse"},
	"utf-8":      {"utf-8", "utf8"},
}

// processWord turns a single whitespace-delimited token into zero or more
// search words. Its main job is to apply summaryReplacements, so that certain
// words are swapped for synonyms or gain extra search terms.
//
// Leading and trailing punctuation is stripped first; a token that is all
// punctuation produces nil. A token with its own entry in summaryReplacements
// is replaced by that entry. Otherwise the token is returned as is, and, if
// it is hyphenated and hyphenSplit allows it, is followed by the replacements
// of any of its hyphen-separated parts.
func processWord(s string) []string {
	word := strings.TrimFunc(s, unicode.IsPunct)
	if word == "" {
		return nil
	}
	if replacement, found := summaryReplacements[word]; found {
		return replacement
	}
	// Nothing more to do unless the word may be split and has a hyphen in it.
	if !hyphenSplit(word) || !strings.Contains(word, "-") {
		return []string{word}
	}

	// Start with the full hyphenated word, then add the replacements for each
	// part. The parts themselves are not added: the Postgres text-search
	// processor will do that.
	out := []string{word}
	for _, part := range strings.Split(word, "-") {
		// A part with no entry yields a nil slice, so nothing is appended.
		out = append(out, summaryReplacements[part]...)
	}
	return out
}

// hyphenSplit reports whether s should be split on hyphens. Everything
// qualifies except strings that begin with "http://" or "https://"; the
// prefix match is case-sensitive.
func hyphenSplit(s string) bool {
	switch {
	case strings.HasPrefix(s, "http://"), strings.HasPrefix(s, "https://"):
		return false
	default:
		return true
	}
}

// isMarkdown reports whether filename, judged by its extension alone, names a
// markdown file. The comparison ignores case.
func isMarkdown(filename string) bool {
	switch strings.ToLower(filepath.Ext(filename)) {
	// https://tools.ietf.org/html/rfc7763 mentions both extensions.
	case ".md", ".markdown":
		return true
	}
	return false
}

// processMarkdown parses s as markdown, using blackfriday with its common
// extensions enabled, and returns the text that walkMarkdown collects from
// the resulting parse tree. Formatting and images are left out.
func processMarkdown(s string) string {
	md := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))
	return string(walkMarkdown(md.Parse([]byte(s)), nil, 0))
}

// walkMarkdown walks the blackfriday parse tree rooted at n depth-first,
// appending the text it finds to buf, and returns the extended buffer.
// level is the depth of n; the root is at level 0.
//
// Image and code-block nodes are dropped together with everything beneath
// them. Paragraph and heading nodes add a separating space (unless buf is
// still empty) rather than their own literal. Every other node contributes
// its literal. A nil n leaves buf untouched.
func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {
	if n == nil {
		return buf
	}

	kind := n.Type
	if kind == blackfriday.Image || kind == blackfriday.CodeBlock {
		// Images are usually irrelevant to the package (badges and such), and
		// code blocks have a wide variety of unrelated symbols.
		return buf
	}

	if kind == blackfriday.Paragraph || kind == blackfriday.Heading {
		// Keep the text of separate blocks from running together.
		if len(buf) > 0 {
			buf = append(buf, ' ')
		}
	} else {
		buf = append(buf, n.Literal...)
	}

	for child := n.FirstChild; child != nil; child = child.Next {
		buf = walkMarkdown(child, buf, level+1)
	}
	return buf
}
