// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package licenses detects licenses and determines whether they are redistributable.
// The functions in this package do not return errors; instead, they log any problems
// they encounter and fail closed by reporting that the module or package is not
// redistributable.
//
// Example (modproxy):
//   d := licenses.NewDetector(modulePath, version, zipReader, log.Infof)
//   modRedist := d.ModuleIsRedistributable()
//
// Example (discovery):
//   d := licenses.NewDetector(modulePath, version, zipReader, log.Infof)
//   modRedist := d.ModuleIsRedistributable()
//   lics := d.AllLicenses()
//   pkgRedist, pkgMetas := d.PackageInfo(pkgSubdir)
package licenses

import (
	"archive/zip"
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"path"
	"path/filepath"
	"sort"
	"strings"
	"sync"

	"github.com/google/licensecheck"
	oldlicensecheck "github.com/google/licensecheck/old"
	"golang.org/x/mod/module"
	modzip "golang.org/x/mod/zip"
	"golang.org/x/pkgsite/internal/log"
)

//go:generate rm -f exceptions.gen.go
//go:generate go run gen_exceptions.go

const (
	// coverageThreshold is the minimum percentage of the file that must contain
	// license text. DetectFile reports a file whose scan coverage is below this
	// value as unknownLicenseType.
	coverageThreshold = 75

	// unknownLicenseType is for text in a license file that's not recognized.
	// It is also the type given to license files that could not be read.
	unknownLicenseType = "UNKNOWN"
)

// maxLicenseSize is the maximum allowable size (in bytes) for a license file.
// readZipFile refuses to read files whose declared uncompressed size exceeds it.
// There are some license files larger than 1 million bytes: https://github.com/vmware/vic/LICENSE
// and github.com/goharbor/harbor/LICENSE, for example.
// It is a variable rather than a constant so that tests can change it.
var maxLicenseSize uint64 = modzip.MaxLICENSE

// Metadata holds information extracted from a license file.
type Metadata struct {
	// Types is the set of license types, as determined by the licensecheck package.
	// It is []string{"UNKNOWN"} when the file could not be read or classified.
	Types []string
	// FilePath is the '/'-separated path to the license file in the module zip,
	// relative to the contents directory.
	FilePath string
	// OldCoverage is the output of oldlicensecheck.Cover.
	// Coverage is the coverage reported by the licensecheck scanner for the
	// file; it is the zero value when the file could not be read.
	OldCoverage oldlicensecheck.Coverage
	Coverage    licensecheck.Coverage
}

// A License is a classified license file path and its contents.
type License struct {
	*Metadata
	// Contents is the raw content of the license file. It is nil if the file
	// could not be read, or after RemoveNonRedistributableData has cleared it.
	Contents []byte
}

// RemoveNonRedistributableData discards the license contents unless the
// license's types establish that it is redistributable. The metadata is
// left untouched.
func (l *License) RemoveNonRedistributableData() {
	if Redistributable(l.Types) {
		// Redistributable licenses keep their text.
		return
	}
	l.Contents = nil
}

var (
	// FileNames is the list of base file names that are treated as license
	// files. Detector.Files compares names case-insensitively, using the
	// lowercased copy of this list held in fileNamesLowercase.
	FileNames = []string{
		"COPYING",
		"COPYING.md",
		"COPYING.markdown",
		"COPYING.txt",
		"LICENCE",
		"LICENCE.md",
		"LICENCE.markdown",
		"LICENCE.txt",
		"LICENSE",
		"LICENSE.md",
		"LICENSE.markdown",
		"LICENSE.txt",
		"LICENSE-2.0.txt",
		"LICENCE-2.0.txt",
		"LICENSE-APACHE",
		"LICENCE-APACHE",
		"LICENSE-APACHE-2.0.txt",
		"LICENCE-APACHE-2.0.txt",
		"LICENSE-MIT",
		"LICENCE-MIT",
		"LICENSE.MIT",
		"LICENCE.MIT",
		"LICENSE.code",
		"LICENCE.code",
		"LICENSE.docs",
		"LICENCE.docs",
		"LICENSE.rst",
		"LICENCE.rst",
		"MIT-LICENSE",
		"MIT-LICENCE",
		"MIT-LICENSE.md",
		"MIT-LICENCE.md",
		"MIT-LICENSE.markdown",
		"MIT-LICENCE.markdown",
		"MIT-LICENSE.txt",
		"MIT-LICENCE.txt",
		"MIT_LICENSE",
		"MIT_LICENCE",
		"UNLICENSE",
		"UNLICENCE",
	}

	// standardRedistributableLicenseTypes is the list of license types, as reported by
	// licensecheck, that allow redistribution, and also have a name that is an OSI or SPDX
	// identifier.
	standardRedistributableLicenseTypes = []string{
		// Licenses acceptable by OSI.
		"AFL-3.0",
		"AGPL-3.0",
		"AGPL-3.0-only",
		"AGPL-3.0-or-later",
		"Apache-1.1",
		"Apache-2.0",
		"Artistic-2.0",
		"BlueOak-1.0.0",
		"0BSD",
		"BSD-1-Clause",
		"BSD-2-Clause",
		"BSD-2-Clause-Patent",
		"BSD-2-Clause-Views",
		"BSD-3-Clause",
		"BSD-3-Clause-Clear",
		"BSD-3-Clause-Open-MPI",
		"BSD-4-Clause",
		"BSD-4-Clause-UC",
		"BSL-1.0",
		"CC-BY-3.0",
		"CC-BY-4.0",
		"CC-BY-SA-3.0",
		"CC-BY-SA-4.0",
		"CC0-1.0",
		"EPL-1.0",
		"EPL-2.0",
		"EUPL-1.2",
		"GPL-2.0",
		"GPL-2.0-only",
		"GPL-2.0-or-later",
		"GPL-3.0",
		"GPL-3.0-only",
		"GPL-3.0-or-later",
		"HPND",
		"ISC",
		"JSON",
		"LGPL-2.1",
		"LGPL-2.1-or-later",
		"LGPL-3.0",
		"LGPL-3.0-or-later",
		"MIT",
		"MIT-0",
		"MPL-2.0",
		"MPL-2.0-no-copyleft-exception",
		"NIST-PD",
		"NIST-PD-fallback",
		"NCSA",
		"OpenSSL",
		"OSL-3.0",
		"PostgreSQL", // TODO: ask legal
		"Python-2.0",
		"Unlicense",
		"UPL-1.0",
		"Zlib",
	}

	// These aren't technically licenses, but they are recognized by
	// licensecheck and safe to ignore. Redistributable skips them: they
	// neither block nor establish redistributability.
	ignorableLicenseTypes = map[string]bool{
		"CC-Notice":          true,
		"GooglePatentClause": true,
		"GooglePatentsFile":  true,
		"blessing":           true,
		"OFL-1.1":            true, // concerns fonts only
	}

	// redistributableLicenseTypes is the set of license types, as reported by
	// licensecheck, that allow redistribution. It consists of the standard
	// types along with some exception types. It starts empty and is filled
	// in by an init function.
	redistributableLicenseTypes = map[string]bool{}
)

// init fills redistributableLicenseTypes and then verifies that every type
// named by an exception is in that set.
func init() {
	// Every standard type is redistributable.
	for i := range standardRedistributableLicenseTypes {
		redistributableLicenseTypes[standardRedistributableLicenseTypes[i]] = true
	}
	// Add here all other types defined in the exceptions.
	redistributableLicenseTypes["Freetype"] = true

	// exceptionTypes maps License IDs from LREs in the exception directory to
	// license types. Any type mentioned in an exception should be
	// redistributable; anything else indicates a configuration problem.
	for _, ts := range exceptionTypes {
		for _, typ := range ts {
			if redistributableLicenseTypes[typ] {
				continue
			}
			log.Fatalf(context.Background(), "%s is an exception type that is not redistributable.", typ)
		}
	}
}

// nonOSILicenses lists licenses that are not approved by OSI.
// AcceptedLicenses links these identifiers to spdx.org instead of
// opensource.org.
//
// NOTE(review): the key "NIST" does not match any entry of
// standardRedistributableLicenseTypes, which lists "NIST-PD" and
// "NIST-PD-fallback"; as written those two get opensource.org links.
// Confirm whether that is intended.
var nonOSILicenses = map[string]bool{
	"BlueOak-1.0.0":      true,
	"BSD-2-Clause-Views": true,
	"CC-BY-3.0":          true,
	"CC-BY-4.0":          true,
	"CC-BY-SA-3.0":       true,
	"CC-BY-SA-4.0":       true,
	"CC0-1.0":            true,
	"JSON":               true,
	"NIST":               true,
	"OpenSSL":            true,
}

// fileNamesLowercase is the set of lowercased FileNames entries. It lets
// callers test a file's base name case-insensitively with a single lookup.
var fileNamesLowercase = map[string]bool{}

// init populates fileNamesLowercase from FileNames.
func init() {
	for i := range FileNames {
		lower := strings.ToLower(FileNames[i])
		fileNamesLowercase[lower] = true
	}
}

// AcceptedLicenseInfo describes a license that is accepted by the discovery site.
// Name is the license identifier, and URL is a link to a page describing the
// license (on spdx.org or opensource.org, as chosen by AcceptedLicenses).
type AcceptedLicenseInfo struct {
	Name string
	URL  string
}

// AcceptedLicenses returns the standard redistributable license types, each
// paired with a link to its description, sorted by name. Licenses listed in
// nonOSILicenses link to spdx.org; all others link to opensource.org.
// The result is intended to be displayed to users.
func AcceptedLicenses() []AcceptedLicenseInfo {
	var infos []AcceptedLicenseInfo
	for _, id := range standardRedistributableLicenseTypes {
		var url string
		switch {
		case nonOSILicenses[id]:
			url = fmt.Sprintf("https://spdx.org/licenses/%s.html", id)
		default:
			url = fmt.Sprintf("https://opensource.org/licenses/%s", id)
		}
		infos = append(infos, AcceptedLicenseInfo{Name: id, URL: url})
	}
	byName := func(a, b int) bool { return infos[a].Name < infos[b].Name }
	sort.Slice(infos, byName)
	return infos
}

var (
	// OmitExceptions causes the list of exceptions to be omitted from license detection.
	// It is intended only to speed up testing, and must be set before the first use
	// of this package. It is read once, when the scanner is first built.
	OmitExceptions bool

	// _scanner is the shared licensecheck scanner. It is built at most once,
	// under scannerOnce; obtain it by calling scanner() rather than reading
	// it directly.
	_scanner    *licensecheck.Scanner
	scannerOnce sync.Once
)

// scanner returns the package-wide licensecheck scanner, building it on the
// first call. The scanner recognizes the exception licenses (unless
// OmitExceptions is set) followed by licensecheck's built-in licenses.
// A failure to build the scanner is reported with log.Fatalf.
func scanner() *licensecheck.Scanner {
	scannerOnce.Do(func() {
		if OmitExceptions {
			exceptionLicenses = nil
		}
		known := append(exceptionLicenses, licensecheck.BuiltinLicenses()...)
		s, err := licensecheck.NewScanner(known)
		if err != nil {
			log.Fatalf(context.Background(), "licensecheck.NewScanner: %v", err)
		}
		_scanner = s
	})
	return _scanner
}

// A Detector detects licenses in a module and its packages.
//
// modulePath and version identify the module; together they name the contents
// directory inside the zip read through zr. logf receives diagnostic messages
// and is never nil once NewDetector has returned.
//
// moduleRedist and moduleLicenses are computed by NewDetector. allLicenses and
// licsByDir are computed lazily, on the first call to AllLicenses or
// PackageInfo. licsByDir is keyed by the directory of each non-root license
// file, relative to the contents directory.
//
// NOTE(review): the lazy computation is not guarded by any lock visible here;
// presumably a Detector is not meant for concurrent use — confirm.
type Detector struct {
	modulePath     string
	version        string
	zr             *zip.Reader
	logf           func(string, ...interface{})
	moduleRedist   bool
	moduleLicenses []*License // licenses at module root directory, or list from exceptions
	allLicenses    []*License
	licsByDir      map[string][]*License // from directory to list of licenses
}

// NewDetector creates a Detector for the module at modulePath and version,
// reading files from zr, which should be the zip file for that module and
// version. Messages are reported through logf; a nil logf disables logging.
// The module-level license information is computed before NewDetector returns.
func NewDetector(modulePath, version string, zr *zip.Reader, logf func(string, ...interface{})) *Detector {
	d := &Detector{modulePath: modulePath, version: version, zr: zr, logf: logf}
	if d.logf == nil {
		// Substitute a no-op so the rest of the code can log unconditionally.
		d.logf = func(string, ...interface{}) {}
	}
	d.computeModuleInfo()
	return d
}

// ModuleIsRedistributable reports whether the given module is redistributable.
// The answer is computed once, by NewDetector, from the license files found
// in the module's root directory.
func (d *Detector) ModuleIsRedistributable() bool {
	return d.moduleRedist
}

// ModuleLicenses returns the licenses that apply to the module.
// The returned slice is the Detector's own, not a copy.
func (d *Detector) ModuleLicenses() []*License {
	return d.moduleLicenses
}

// AllLicenses returns every license detected anywhere in the module: the
// module licenses followed by the licenses found in subdirectories. The
// information is computed on first use and reused afterwards.
func (d *Detector) AllLicenses() []*License {
	if d.allLicenses != nil {
		return d.allLicenses
	}
	d.computeAllLicenseInfo()
	return d.allLicenses
}

// PackageInfo reports whether the package at dir, a directory relative to the
// module root, is redistributable. It also returns all the licenses that apply
// to the package: the licenses found in dir and in its ancestor directories
// below the root, ordered from the outermost directory to the innermost,
// followed by the module licenses.
//
// If dir is absolute, or its cleaned form begins with "..", PackageInfo
// returns false and no licenses.
func (d *Detector) PackageInfo(dir string) (isRedistributable bool, lics []*License) {
	cleanDir := filepath.ToSlash(filepath.Clean(dir))
	if path.IsAbs(cleanDir) || strings.HasPrefix(cleanDir, "..") {
		return false, nil
	}
	if d.allLicenses == nil {
		d.computeAllLicenseInfo()
	}
	// Find the license directories that are dir itself or one of its
	// ancestors, excluding the root.
	var prefixes []string
	for prefix := range d.licsByDir {
		// append a slash so that prefix a/b does not match a/bc/d
		if strings.HasPrefix(cleanDir+"/", prefix+"/") {
			prefixes = append(prefixes, prefix)
		}
	}
	// Map iteration order is unspecified, so sort to make the result
	// deterministic. Every match is a path prefix of cleanDir, so sorting
	// orders them from outermost to innermost.
	sort.Strings(prefixes)
	for _, prefix := range prefixes {
		lics = append(lics, d.licsByDir[prefix]...)
	}
	// A package is redistributable if its module is, and if other licenses on
	// the path to the root are redistributable. Note that this is not the same
	// as asking if the module licenses plus the package licenses are
	// redistributable. A module that is granted an exception may have
	// licenses that are non-redistributable.
	ltypes := types(lics)
	isRedistributable = d.ModuleIsRedistributable() && (len(ltypes) == 0 || Redistributable(ltypes))
	// A package's licenses include the ones we've already computed, as well
	// as the module licenses.
	return isRedistributable, append(lics, d.moduleLicenses...)
}

// computeModuleInfo fills in d.moduleLicenses and d.moduleRedist from the
// license files in the module's root (contents) directory. The module is
// redistributable only if the types of those licenses are.
func (d *Detector) computeModuleInfo() {
	rootFiles := d.Files(RootFiles)
	d.moduleLicenses = d.detectFiles(rootFiles)
	moduleTypes := types(d.moduleLicenses)
	d.moduleRedist = Redistributable(moduleTypes)
}

// computeAllLicenseInfo detects the licenses outside the module root and
// records them, after the module licenses, in d.allLicenses. It also groups
// the non-root licenses by directory in d.licsByDir so that
// Detector.PackageInfo can look them up quickly.
func (d *Detector) computeAllLicenseInfo() {
	nonRoot := d.detectFiles(d.Files(NonRootFiles))

	// Start from a non-nil empty slice: a nil allLicenses means "not computed".
	all := append([]*License{}, d.moduleLicenses...)
	all = append(all, nonRoot...)
	d.allLicenses = all

	byDir := map[string][]*License{}
	for _, lic := range nonRoot {
		dir := path.Dir(lic.FilePath)
		byDir[dir] = append(byDir[dir], lic)
	}
	d.licsByDir = byDir
}

// WhichFiles describes which files from the zip should be returned by Detector.Files.
type WhichFiles int

const (
	// RootFiles selects only files from the root (contents) directory.
	RootFiles WhichFiles = iota
	// NonRootFiles selects only files that are not in the root directory.
	NonRootFiles
	// AllFiles selects all files; the union of root and non-root.
	AllFiles
)

// Files returns a list of license files from the zip. The which argument
// determines the location of the files considered.
//
// A file is returned only if its base name matches one of FileNames
// (case-insensitively), it lies under the module's contents directory, it is
// not listed in ignoreFiles, it is not vendored, and its path passes
// module.CheckFilePath. Files rejected for lying outside the contents
// directory or for having a bad path are logged.
func (d *Detector) Files(which WhichFiles) []*zip.File {
	cdir := contentsDir(d.modulePath, d.version)
	prefix := pathPrefix(cdir)
	var files []*zip.File
	for _, f := range d.zr.File {
		if !fileNamesLowercase[strings.ToLower(path.Base(f.Name))] {
			continue
		}
		if !strings.HasPrefix(f.Name, prefix) {
			d.logf("potential license file %q found outside of the expected path %q", f.Name, cdir)
			continue
		}
		// rel is the file's path relative to the contents directory.
		rel := strings.TrimPrefix(f.Name, prefix)
		// Skip files we should ignore.
		if ignoreFiles[d.modulePath+" "+rel] {
			continue
		}
		atRoot := path.Dir(f.Name) == cdir
		if which == RootFiles && !atRoot {
			// Skip f since it's not at root.
			continue
		}
		if which == NonRootFiles && atRoot {
			// Skip f since it is at root.
			continue
		}
		// Check the relative path, not f.Name: the zip name begins with the
		// module path, and a module path that itself contains "/vendor/"
		// would otherwise make every file in the module look vendored.
		if isVendoredFile(rel) {
			// Skip if f is in the vendor directory.
			continue
		}
		if err := module.CheckFilePath(f.Name); err != nil {
			// Skip if the file path is bad.
			d.logf("module.CheckFilePath(%q): %v", f.Name, err)
			continue
		}
		files = append(files, f)
	}
	return files
}

// isVendoredFile reports whether name lies in a proper subdirectory of a
// 'vendor' directory. Files directly inside 'vendor' do not count, which
// allows for Go packages that are themselves named 'vendor'.
//
// e.g. isVendoredFile("vendor/LICENSE") == false, and
//      isVendoredFile("vendor/foo/LICENSE") == true
func isVendoredFile(name string) bool {
	const (
		leading = "vendor/"
		inner   = "/vendor/"
	)
	// afterVendor is the part of name that follows the vendor directory.
	var afterVendor string
	switch {
	case strings.HasPrefix(name, leading):
		afterVendor = name[len(leading):]
	default:
		at := strings.Index(name, inner)
		if at < 0 {
			// no vendor directory
			return false
		}
		afterVendor = name[at+len(inner):]
	}
	// A remaining slash means the file sits below a subdirectory of vendor.
	return strings.Contains(afterVendor, "/")
}

// detectFiles classifies each of the given zip files with DetectFile and
// returns one License per file, in the same order. A file that cannot be
// read is logged and recorded as a license of unknown type with no contents.
// FilePath in each result is relative to the module's contents directory.
func (d *Detector) detectFiles(files []*zip.File) []*License {
	prefix := pathPrefix(contentsDir(d.modulePath, d.version))
	var result []*License
	for _, zf := range files {
		meta := &Metadata{FilePath: strings.TrimPrefix(zf.Name, prefix)}
		lic := &License{Metadata: meta}
		data, err := readZipFile(zf)
		if err != nil {
			d.logf("reading zip file %s: %v", zf.Name, err)
			meta.Types = []string{unknownLicenseType}
		} else {
			meta.Types, meta.Coverage = DetectFile(data, zf.Name, d.logf)
			lic.Contents = data
		}
		result = append(result, lic)
	}
	return result
}

// DetectFile returns the sorted set of license types found in the given file
// contents, along with the licensecheck coverage information. If the
// coverage is below coverageThreshold, or no type can be determined, the
// only type returned is "UNKNOWN". The filename is used solely for logging;
// a nil logf disables logging.
func DetectFile(contents []byte, filename string, logf func(string, ...interface{})) ([]string, licensecheck.Coverage) {
	if logf == nil {
		logf = func(string, ...interface{}) {}
	}
	cov := scanner().Scan(contents)
	if cov.Percent < float64(coverageThreshold) {
		logf("%s license coverage too low (%+v), skipping", filename, cov)
		return []string{unknownLicenseType}, cov
	}
	seen := map[string]bool{}
	for _, match := range cov.Match {
		// A match on an exception stands for the types listed for that
		// exception; any other match stands for its own ID.
		if mapped := exceptionTypes[match.ID]; mapped != nil {
			for _, t := range mapped {
				seen[t] = true
			}
			continue
		}
		seen[match.ID] = true
	}
	if len(seen) == 0 {
		logf("%s failed to classify license (%+v), skipping", filename, cov)
		return []string{unknownLicenseType}, cov
	}
	return setToSortedSlice(seen), cov
}

// Redistributable reports whether the given license types establish that a
// module or package is redistributable. Ignorable types are skipped. Every
// remaining type must allow redistribution, and there must be at least one
// such type; an empty or all-ignorable list is therefore not redistributable.
func Redistributable(licenseTypes []string) bool {
	redistCount := 0
	for _, lt := range licenseTypes {
		switch {
		case ignorableLicenseTypes[lt]:
			// Not a real license; it has no say either way.
		case redistributableLicenseTypes[lt]:
			redistCount++
		default:
			return false
		}
	}
	return redistCount > 0
}

// types returns the concatenation of the Types of each license in lics, in
// order. Duplicates are kept. The result is nil if there are no types.
func types(lics []*License) []string {
	var all []string
	for i := range lics {
		all = append(all, lics[i].Types...)
	}
	return all
}

// setToSortedSlice returns the keys of m in increasing order. Every key is
// included, whatever its value. The result is nil when m has no keys.
func setToSortedSlice(m map[string]bool) []string {
	if len(m) == 0 {
		return nil
	}
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	return keys
}

// readZipFile returns the contents of f. It fails without opening the file
// if the uncompressed size recorded for f exceeds maxLicenseSize, and it
// never reads more than maxLicenseSize bytes.
func readZipFile(f *zip.File) ([]byte, error) {
	if size := f.UncompressedSize64; size > maxLicenseSize {
		return nil, fmt.Errorf("file size %d exceeds max license size %d", size, maxLicenseSize)
	}
	rc, err := f.Open()
	if err != nil {
		return nil, err
	}
	defer rc.Close()
	// Cap the read as well, rather than relying on the recorded size alone.
	limited := io.LimitReader(rc, int64(maxLicenseSize))
	return ioutil.ReadAll(limited)
}

// contentsDir returns the name of the directory inside a module zip that
// holds the module's files: the module path and version joined by "@".
func contentsDir(modulePath, version string) string {
	const sep = "@"
	return modulePath + sep + version
}

// pathPrefix returns s followed by "/", or the empty string if s is empty.
func pathPrefix(s string) string {
	if s == "" {
		return ""
	}
	return s + "/"
}
