blob: b667fe1dcdb33d243135a2ef21939df9fc96fd19 [file] [log] [blame]
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !aix && !plan9
// +build !aix,!plan9
package generators
import (
"compress/bzip2"
"io"
"os"
"path/filepath"
"github.com/blevesearch/bleve"
_ "github.com/blevesearch/bleve/analysis/analyzer/keyword"
wikiparse "github.com/dustin/go-wikiparse"
"golang.org/x/benchmarks/sweet/common"
blevebench "golang.org/x/benchmarks/third_party/bleve-bench"
)
// documents is the maximum number of pages read from the wiki dump
// when building the index. Pages that have no revisions still count
// toward this limit but are skipped, so the resulting index may hold
// fewer entries than this value.
const documents = 25000
// wikiDumpPath is a path to the static asset from
// which we'll build our index. It is joined onto the assets
// directory by Generate, so it is relative to that directory and
// points into the sibling bleve-index directory. wikiDumpName is
// declared elsewhere in this package.
var wikiDumpPath = filepath.Join("..", "bleve-index", wikiDumpName)
// BleveQuery is a dynamic assets Generator for the bleve-query benchmark.
// It carries no state; all inputs arrive through the configuration
// passed to Generate.
type BleveQuery struct{}
// Generate creates a persistent index for the Bleve search engine for
// the bleve-query benchmark. It generates this index from a subset of
// the static assets for the bleve-index benchmark, a dump of wikipedia
// from 2008.
//
// It first copies README.md from cfg.AssetsDir to cfg.OutputDir. It then
// reads at most documents pages from the bzip2-compressed dump located at
// wikiDumpPath (relative to cfg.AssetsDir) and indexes the text of
// Revisions[0] of each page, keyed by page title, into a new on-disk
// index at cfg.OutputDir/index. Pages without revisions count toward the
// limit but are not indexed.
//
// The first error encountered is returned, including a failure to add a
// document to a batch. If everything else succeeds, the error from
// closing the index (if any) is returned.
func (_ BleveQuery) Generate(cfg *common.GenConfig) (err error) {
	// Copy README.md over.
	if err := copyFiles(cfg.OutputDir, cfg.AssetsDir, []string{"README.md"}); err != nil {
		return err
	}

	f, err := os.Open(filepath.Join(cfg.AssetsDir, wikiDumpPath))
	if err != nil {
		return err
	}
	// The dump is only read, so the result of Close is not interesting.
	defer f.Close()

	z := bzip2.NewReader(f)
	parser, err := wikiparse.NewParser(z)
	if err != nil {
		return err
	}

	// Create a new Bleve index with on-disk
	// storage in the output directory.
	mapping := blevebench.ArticleMapping()
	outputDir := filepath.Join(cfg.OutputDir, "index")
	index, err := bleve.New(outputDir, mapping)
	if err != nil {
		return err
	}
	defer func() {
		// Make sure we close the index so the data
		// persists to disk. Only report the close error
		// if nothing else has already failed.
		cerr := index.Close()
		if err == nil {
			err = cerr
		}
	}()

	todo := ^uint64(0)
	if documents >= 0 {
		todo = uint64(documents)
	}

	// Create batches of wikipedia articles
	// and index them.
	const batchSize = 256
	b := index.NewBatch()
	for i := uint64(0); i < todo; i++ {
		// Use distinct names for loop-local errors so the named
		// result err (read by the deferred close) is never shadowed.
		page, nextErr := parser.Next()
		if nextErr == io.EOF {
			break
		}
		if nextErr != nil {
			return nextErr
		}
		if len(page.Revisions) == 0 {
			continue
		}
		// Batch.Index can fail (for example on an empty ID or a
		// mapping error); surface that instead of silently
		// producing an index that is missing documents.
		indexErr := b.Index(page.Title, blevebench.Article{
			Title: page.Title,
			Text:  page.Revisions[0].Text,
		})
		if indexErr != nil {
			return indexErr
		}
		if b.Size() >= batchSize {
			if batchErr := index.Batch(b); batchErr != nil {
				return batchErr
			}
			b = index.NewBatch()
		}
	}

	// Flush whatever is left in the final, partially filled batch.
	if b.Size() != 0 {
		if batchErr := index.Batch(b); batchErr != nil {
			return batchErr
		}
	}
	return nil
}