vulndb/internal/audit/detect_callgraph.go - exp - Git at Google

 // Copyright 2021 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package audit

 import (
 	"container/list"
 	"fmt"
 	"go/token"
 	"log"
 	"strings"
 	"sync"

 	"golang.org/x/tools/go/callgraph"
 	"golang.org/x/tools/go/packages"
 	"golang.org/x/tools/go/ssa"
 	"golang.org/x/tools/go/ssa/ssautil"

 	"golang.org/x/tools/go/callgraph/cha"
 	"golang.org/x/tools/go/callgraph/vta"
 )

 // VulnerableSymbols returns vulnerability findings for symbols transitively reachable
 // through the callgraph built using VTA analysis from the entry points of pkgs, given
 // 'modVulns' vulnerabilities.
 //
 // Returns all findings reachable from pkgs while analyzing each package only once,
 // prefering findings of shorter import traces. For instance, given call chains
 //   A() -> B() -> V
 //   A() -> D() -> B() -> V
 //   D() -> B() -> V
 // where A and D are top level packages and V is a vulnerable symbol, VulnerableSymbols
 // can return either
 //   A() -> B() -> V
 // or
 //   D() -> B() -> V
 // as traces of transitively using a vulnerable symbol V.
 //
 // Findings for each vulnerability are sorted by estimated usefulness to the user.
 //
 // Panics if packages in pkgs do not belong to the same program.
 func VulnerableSymbols(pkgs []*ssa.Package, modVulns ModuleVulnerabilities) Results {
 	results := Results{
 		SearchMode:      CallGraphSearch,
 		Vulnerabilities: serialize(modVulns.Vulns()),
 		VulnFindings:    make(map[string][]Finding),
 	}
 	if len(modVulns) == 0 {
 		return results
 	}

 	prog := pkgsProgram(pkgs)
 	if prog == nil {
 		panic("packages in pkgs must belong to a single common program")
 	}
 	entries := entryPoints(pkgs)
 	callGraph := callGraph(prog, entries)

 	queue := list.New()
 	for _, entry := range entries {
 		queue.PushBack(&callChain{f: entry})
 	}

 	seen := make(map[*ssa.Function]bool)
 	for queue.Len() > 0 {
 		front := queue.Front()
 		v := front.Value.(*callChain)
 		queue.Remove(front)

 		if seen[v.f] {
 			continue
 		}
 		seen[v.f] = true

 		calls := funcVulnsAndCalls(v, modVulns, &results, callGraph)
 		for _, call := range calls {
 			queue.PushBack(call)
 		}
 	}

 	results.sort()
 	return results
 }

 // callGraph builds a call graph of prog based on VTA analysis.
 func callGraph(prog *ssa.Program, entries []*ssa.Function) *callgraph.Graph {
 	entrySlice := make(map[*ssa.Function]bool)
 	for _, e := range entries {
 		entrySlice[e] = true
 	}
 	initial := cha.CallGraph(prog)
 	allFuncs := ssautil.AllFunctions(prog)

 	fslice := forwardReachableFrom(entrySlice, initial)
 	// Keep only actually linked functions.
 	pruneSlice(fslice, allFuncs)
 	vtaCg := vta.CallGraph(fslice, initial)

 	// Repeat the process once more, this time using
 	// the produced VTA call graph as the base graph.
 	fslice = forwardReachableFrom(entrySlice, vtaCg)
 	pruneSlice(fslice, allFuncs)

 	return vta.CallGraph(fslice, vtaCg)
 }

 func entryPoints(topPackages []*ssa.Package) []*ssa.Function {
 	var entries []*ssa.Function
 	for _, pkg := range topPackages {
 		if pkg.Pkg.Name() == "main" {
 			// for "main" packages the only valid entry points are the "main"
 			// function and any "init#" functions, even if there are other
 			// exported functions or types. similarly to isEntry it should be
 			// safe to ignore the validity of the main or init# signatures,
 			// since the compiler will reject malformed definitions,
 			// and the init function is synthetic
 			entries = append(entries, memberFuncs(pkg.Members["main"], pkg.Prog)...)
 			for name, member := range pkg.Members {
 				if strings.HasPrefix(name, "init#") || name == "init" {
 					entries = append(entries, memberFuncs(member, pkg.Prog)...)
 				}
 			}
 			continue
 		}
 		for _, member := range pkg.Members {
 			for _, f := range memberFuncs(member, pkg.Prog) {
 				if isEntry(f) {
 					entries = append(entries, f)
 				}
 			}
 		}
 	}
 	return entries
 }

 func isEntry(f *ssa.Function) bool {
 	// it should be safe to ignore checking that the signature of the "init" function
 	// is valid, since it is synthetic
 	if f.Name() == "init" && f.Synthetic == "package initializer" {
 		return true
 	}

 	return f.Synthetic == "" && f.Object() != nil && f.Object().Exported()
 }

 // callChain helps doing BFS over package call graph while remembering the call stack.
 type callChain struct {
 	// nil for entry points of the chain.
 	call   ssa.CallInstruction
 	f      *ssa.Function
 	parent *callChain
 }

 func (chain *callChain) trace() []TraceElem {
 	if chain == nil {
 		return nil
 	}

 	var pos *token.Position
 	desc := fmt.Sprintf("%s.%s(...)", pkgPath(chain.f), dbFuncName(chain.f))
 	if chain.call != nil {
 		pos = instrPosition(chain.call)
 		if unresolved(chain.call) {
 			// In case of a statically unresolved call site, communicate to the client
 			// that this was approximatelly resolved to chain.f.
 			desc = fmt.Sprintf("%s(...) [approx. resolved to %s]", callName(chain.call), chain.f)
 		}
 	} else {
 		// No call information means the function is an entry point.
 		pos = funcPosition(chain.f)
 	}

 	return append(chain.parent.trace(), TraceElem{Description: desc, Position: pos})
 }

 // weight computes an approximate measure of how easy is to understand the call
 // chain when presented to the client as a trace. The smaller the value, the more
 // understendeable the chain is. Currently defined as the number of unresolved
 // call sites in the chain.
 func (chain *callChain) weight() int {
 	if chain == nil || chain.call == nil {
 		return 0
 	}

 	callWeight := 0
 	if unresolved(chain.call) {
 		callWeight = 1
 	}
 	return callWeight + chain.parent.weight()
 }

 // for assesing confidence level of findings.
 var stdPackages = make(map[string]bool)
 var loadStdsOnce sync.Once

 func isStdPackage(pkg *ssa.Package) bool {
 	if pkg == nil || pkg.Pkg == nil {
 		return false
 	}

 	loadStdsOnce.Do(func() {
 		pkgs, err := packages.Load(nil, "std")
 		if err != nil {
 			log.Printf("warning: unable to fetch list of std packages, ordering of findings might be affected: %v", err)
 		}

 		for _, p := range pkgs {
 			stdPackages[p.PkgPath] = true
 		}
 	})
 	return stdPackages[pkg.Pkg.Path()]
 }

 // confidence computes an approximate measure of whether the `chain`
 // represents a true finding. Currently, it equals the number of call
 // sites in `chain` that go through standard libraries. Such findings
 // have been experimentally shown to often result in false positives.
 //
 // TODO: add tests for trace confidence computation involving std libs.
 func (chain *callChain) confidence() int {
 	if chain == nil || chain.call == nil {
 		return 0
 	}

 	callConfidence := 0
 	if isStdPackage(chain.call.Parent().Pkg) {
 		callConfidence = 1
 	}
 	return callConfidence + chain.parent.confidence()
 }

 // funcVulnsAndCalls adds symbol findings to results for
 // function at the top of chain and next calls to analyze.
 func funcVulnsAndCalls(chain *callChain, modVulns ModuleVulnerabilities, results *Results, callGraph *callgraph.Graph) []*callChain {
 	var calls []*callChain
 	for _, b := range chain.f.Blocks {
 		for _, instr := range b.Instrs {
 			// First collect all findings for globals except callees in function call statements.
 			globalFindings(globalUses(instr), chain, modVulns, results)

 			// Callees are handled separately to produce call findings rather than global findings.
 			site, ok := instr.(ssa.CallInstruction)
 			if !ok {
 				continue
 			}

 			for _, callee := range siteCallees(site, callGraph) {
 				c := &callChain{call: site, f: callee, parent: chain}
 				calls = append(calls, c)
 				callFinding(c, modVulns, results)
 			}
 		}
 	}
 	return calls
 }

 // globalFindings adds findings for vulnerable globals among globalUses to results.
 // Assumes each use in globalUses is a use of a global variable. Can generate
 // duplicates when globalUses contains duplicates.
 func globalFindings(globalUses []*ssa.Value, chain *callChain, modVulns ModuleVulnerabilities, results *Results) {
 	if underRelatedVuln(chain, modVulns) {
 		return
 	}

 	for _, o := range globalUses {
 		g := (*o).(*ssa.Global)
 		vulns := modVulns.VulnsForSymbol(g.Package().Pkg.Path(), g.Name())
 		for _, v := range serialize(vulns) {
 			results.addFinding(v, Finding{
 				Symbol:     fmt.Sprintf("%s.%s", g.Package().Pkg.Path(), g.Name()),
 				Trace:      chain.trace(),
 				Position:   valPosition(*o, chain.f),
 				Type:       GlobalType,
 				weight:     chain.weight(),
 				confidence: chain.confidence()})
 		}
 	}
 }

 // callFinding adds findings to results for the call made at the top of the chain.
 // If there is no vulnerability or no call information, then nil is returned.
 // TODO(zpavlinovic): remove ssa info from higher-order calls.
 func callFinding(chain *callChain, modVulns ModuleVulnerabilities, results *Results) {
 	if underRelatedVuln(chain, modVulns) {
 		return
 	}

 	callee := chain.f
 	call := chain.call
 	if callee == nil || call == nil {
 		return
 	}

 	c := chain
 	if !unresolved(call) {
 		// If the last call is a resolved callsite, remove the edge from the trace as that
 		// information is provided in the symbol field.
 		c = c.parent
 	}

 	// some synthetic functions might not have an associated package
 	if callee.Package() == nil || callee.Package().Pkg == nil {
 		return
 	}

 	vulns := modVulns.VulnsForSymbol(callee.Package().Pkg.Path(), dbFuncName(callee))
 	for _, v := range serialize(vulns) {
 		results.addFinding(v, Finding{
 			Symbol:     fmt.Sprintf("%s.%s", callee.Package().Pkg.Path(), dbFuncName(callee)),
 			Trace:      c.trace(),
 			Position:   instrPosition(call),
 			Type:       FunctionType,
 			weight:     c.weight(),
 			confidence: c.confidence()})
 	}
 }

 // Checks if a potential vulnerability in chain.f is analyzed only because
 // a previous vulnerability in the same package as chain.f has been seen.
 // For instance, for the chain P1:A -> P2:B -> P2:C where both B and C are
 // vulnerable, the function returns true since B is already vulnerable and
 // has hence been reported. Clients are likely not interested in vulnerabilties
 // inside of a function that is already deemed vulnerable. This is an optimization
 // step to stop flooding of findings when a package has a lot of known vulnerable
 // symbols (e.g., all of them).
 //
 // Note that for P1:A -> P2:B -> P3:D -> P2:C the function returns false. This
 // is because C is called from D that comes from a different package.
 func underRelatedVuln(chain *callChain, modVulns ModuleVulnerabilities) bool {
 	pkg := pkgPath(chain.f)

 	c := chain
 	for {
 		c = c.parent
 		// Analyze the immediate substack related to pkg.
 		if c == nil || pkgPath(c.f) != pkg {
 			break
 		}

 		// some synthetic functions might not have an associated package
 		if c.f.Pkg == nil || c.f.Pkg.Pkg == nil {
 			continue
 		}

 		// TODO: can we optimize using the information on findings already reported?
 		if len(modVulns.VulnsForSymbol(c.f.Pkg.Pkg.Path(), dbFuncName(c.f))) > 0 {
 			return true
 		}
 	}
 	return false
 }
	// Copyright 2021 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package audit

	import (
	"container/list"
	"fmt"
	"go/token"
	"log"
	"strings"
	"sync"

	"golang.org/x/tools/go/callgraph"
	"golang.org/x/tools/go/packages"
	"golang.org/x/tools/go/ssa"
	"golang.org/x/tools/go/ssa/ssautil"

	"golang.org/x/tools/go/callgraph/cha"
	"golang.org/x/tools/go/callgraph/vta"
	)

	// VulnerableSymbols returns vulnerability findings for symbols transitively reachable
	// through the callgraph built using VTA analysis from the entry points of pkgs, given
	// 'modVulns' vulnerabilities.
	//
	// Returns all findings reachable from pkgs while analyzing each package only once,
	// prefering findings of shorter import traces. For instance, given call chains
	// A() -> B() -> V
	// A() -> D() -> B() -> V
	// D() -> B() -> V
	// where A and D are top level packages and V is a vulnerable symbol, VulnerableSymbols
	// can return either
	// A() -> B() -> V
	// or
	// D() -> B() -> V
	// as traces of transitively using a vulnerable symbol V.
	//
	// Findings for each vulnerability are sorted by estimated usefulness to the user.
	//
	// Panics if packages in pkgs do not belong to the same program.
	func VulnerableSymbols(pkgs []*ssa.Package, modVulns ModuleVulnerabilities) Results {
	results := Results{
	SearchMode: CallGraphSearch,
	Vulnerabilities: serialize(modVulns.Vulns()),
	VulnFindings: make(map[string][]Finding),
	}
	if len(modVulns) == 0 {
	return results
	}

	prog := pkgsProgram(pkgs)
	if prog == nil {
	panic("packages in pkgs must belong to a single common program")
	}
	entries := entryPoints(pkgs)
	callGraph := callGraph(prog, entries)

	queue := list.New()
	for _, entry := range entries {
	queue.PushBack(&callChain{f: entry})
	}

	seen := make(map[*ssa.Function]bool)
	for queue.Len() > 0 {
	front := queue.Front()
	v := front.Value.(*callChain)
	queue.Remove(front)

	if seen[v.f] {
	continue
	}
	seen[v.f] = true

	calls := funcVulnsAndCalls(v, modVulns, &results, callGraph)
	for _, call := range calls {
	queue.PushBack(call)
	}
	}

	results.sort()
	return results
	}

	// callGraph builds a call graph of prog based on VTA analysis.
	func callGraph(prog ssa.Program, entries []ssa.Function) *callgraph.Graph {
	entrySlice := make(map[*ssa.Function]bool)
	for _, e := range entries {
	entrySlice[e] = true
	}
	initial := cha.CallGraph(prog)
	allFuncs := ssautil.AllFunctions(prog)

	fslice := forwardReachableFrom(entrySlice, initial)
	// Keep only actually linked functions.
	pruneSlice(fslice, allFuncs)
	vtaCg := vta.CallGraph(fslice, initial)

	// Repeat the process once more, this time using
	// the produced VTA call graph as the base graph.
	fslice = forwardReachableFrom(entrySlice, vtaCg)
	pruneSlice(fslice, allFuncs)

	return vta.CallGraph(fslice, vtaCg)
	}

	func entryPoints(topPackages []ssa.Package) []ssa.Function {
	var entries []*ssa.Function
	for _, pkg := range topPackages {
	if pkg.Pkg.Name() == "main" {
	// for "main" packages the only valid entry points are the "main"
	// function and any "init#" functions, even if there are other
	// exported functions or types. similarly to isEntry it should be
	// safe to ignore the validity of the main or init# signatures,
	// since the compiler will reject malformed definitions,
	// and the init function is synthetic
	entries = append(entries, memberFuncs(pkg.Members["main"], pkg.Prog)...)
	for name, member := range pkg.Members {
	if strings.HasPrefix(name, "init#") \|\| name == "init" {
	entries = append(entries, memberFuncs(member, pkg.Prog)...)
	}
	}
	continue
	}
	for _, member := range pkg.Members {
	for _, f := range memberFuncs(member, pkg.Prog) {
	if isEntry(f) {
	entries = append(entries, f)
	}
	}
	}
	}
	return entries
	}

	func isEntry(f *ssa.Function) bool {
	// it should be safe to ignore checking that the signature of the "init" function
	// is valid, since it is synthetic
	if f.Name() == "init" && f.Synthetic == "package initializer" {
	return true
	}

	return f.Synthetic == "" && f.Object() != nil && f.Object().Exported()
	}

	// callChain helps doing BFS over package call graph while remembering the call stack.
	type callChain struct {
	// nil for entry points of the chain.
	call ssa.CallInstruction
	f *ssa.Function
	parent *callChain
	}

	func (chain *callChain) trace() []TraceElem {
	if chain == nil {
	return nil
	}

	var pos *token.Position
	desc := fmt.Sprintf("%s.%s(...)", pkgPath(chain.f), dbFuncName(chain.f))
	if chain.call != nil {
	pos = instrPosition(chain.call)
	if unresolved(chain.call) {
	// In case of a statically unresolved call site, communicate to the client
	// that this was approximatelly resolved to chain.f.
	desc = fmt.Sprintf("%s(...) [approx. resolved to %s]", callName(chain.call), chain.f)
	}
	} else {
	// No call information means the function is an entry point.
	pos = funcPosition(chain.f)
	}

	return append(chain.parent.trace(), TraceElem{Description: desc, Position: pos})
	}

	// weight computes an approximate measure of how easy is to understand the call
	// chain when presented to the client as a trace. The smaller the value, the more
	// understendeable the chain is. Currently defined as the number of unresolved
	// call sites in the chain.
	func (chain *callChain) weight() int {
	if chain == nil \|\| chain.call == nil {
	return 0
	}

	callWeight := 0
	if unresolved(chain.call) {
	callWeight = 1
	}
	return callWeight + chain.parent.weight()
	}

	// for assesing confidence level of findings.
	var stdPackages = make(map[string]bool)
	var loadStdsOnce sync.Once

	func isStdPackage(pkg *ssa.Package) bool {
	if pkg == nil \|\| pkg.Pkg == nil {
	return false
	}

	loadStdsOnce.Do(func() {
	pkgs, err := packages.Load(nil, "std")
	if err != nil {
	log.Printf("warning: unable to fetch list of std packages, ordering of findings might be affected: %v", err)
	}

	for _, p := range pkgs {
	stdPackages[p.PkgPath] = true
	}
	})
	return stdPackages[pkg.Pkg.Path()]
	}

	// confidence computes an approximate measure of whether the `chain`
	// represents a true finding. Currently, it equals the number of call
	// sites in `chain` that go through standard libraries. Such findings
	// have been experimentally shown to often result in false positives.
	//
	// TODO: add tests for trace confidence computation involving std libs.
	func (chain *callChain) confidence() int {
	if chain == nil \|\| chain.call == nil {
	return 0
	}

	callConfidence := 0
	if isStdPackage(chain.call.Parent().Pkg) {
	callConfidence = 1
	}
	return callConfidence + chain.parent.confidence()
	}

	// funcVulnsAndCalls adds symbol findings to results for
	// function at the top of chain and next calls to analyze.
	func funcVulnsAndCalls(chain callChain, modVulns ModuleVulnerabilities, results Results, callGraph callgraph.Graph) []callChain {
	var calls []*callChain
	for _, b := range chain.f.Blocks {
	for _, instr := range b.Instrs {
	// First collect all findings for globals except callees in function call statements.
	globalFindings(globalUses(instr), chain, modVulns, results)

	// Callees are handled separately to produce call findings rather than global findings.
	site, ok := instr.(ssa.CallInstruction)
	if !ok {
	continue
	}

	for _, callee := range siteCallees(site, callGraph) {
	c := &callChain{call: site, f: callee, parent: chain}
	calls = append(calls, c)
	callFinding(c, modVulns, results)
	}
	}
	}
	return calls
	}

	// globalFindings adds findings for vulnerable globals among globalUses to results.
	// Assumes each use in globalUses is a use of a global variable. Can generate
	// duplicates when globalUses contains duplicates.
	func globalFindings(globalUses []ssa.Value, chain callChain, modVulns ModuleVulnerabilities, results *Results) {
	if underRelatedVuln(chain, modVulns) {
	return
	}

	for _, o := range globalUses {
	g := (o).(ssa.Global)
	vulns := modVulns.VulnsForSymbol(g.Package().Pkg.Path(), g.Name())
	for _, v := range serialize(vulns) {
	results.addFinding(v, Finding{
	Symbol: fmt.Sprintf("%s.%s", g.Package().Pkg.Path(), g.Name()),
	Trace: chain.trace(),
	Position: valPosition(*o, chain.f),
	Type: GlobalType,
	weight: chain.weight(),
	confidence: chain.confidence()})
	}
	}
	}

	// callFinding adds findings to results for the call made at the top of the chain.
	// If there is no vulnerability or no call information, then nil is returned.
	// TODO(zpavlinovic): remove ssa info from higher-order calls.
	func callFinding(chain callChain, modVulns ModuleVulnerabilities, results Results) {
	if underRelatedVuln(chain, modVulns) {
	return
	}

	callee := chain.f
	call := chain.call
	if callee == nil \|\| call == nil {
	return
	}

	c := chain
	if !unresolved(call) {
	// If the last call is a resolved callsite, remove the edge from the trace as that
	// information is provided in the symbol field.
	c = c.parent
	}

	// some synthetic functions might not have an associated package
	if callee.Package() == nil \|\| callee.Package().Pkg == nil {
	return
	}

	vulns := modVulns.VulnsForSymbol(callee.Package().Pkg.Path(), dbFuncName(callee))
	for _, v := range serialize(vulns) {
	results.addFinding(v, Finding{
	Symbol: fmt.Sprintf("%s.%s", callee.Package().Pkg.Path(), dbFuncName(callee)),
	Trace: c.trace(),
	Position: instrPosition(call),
	Type: FunctionType,
	weight: c.weight(),
	confidence: c.confidence()})
	}
	}

	// Checks if a potential vulnerability in chain.f is analyzed only because
	// a previous vulnerability in the same package as chain.f has been seen.
	// For instance, for the chain P1:A -> P2:B -> P2:C where both B and C are
	// vulnerable, the function returns true since B is already vulnerable and
	// has hence been reported. Clients are likely not interested in vulnerabilties
	// inside of a function that is already deemed vulnerable. This is an optimization
	// step to stop flooding of findings when a package has a lot of known vulnerable
	// symbols (e.g., all of them).
	//
	// Note that for P1:A -> P2:B -> P3:D -> P2:C the function returns false. This
	// is because C is called from D that comes from a different package.
	func underRelatedVuln(chain *callChain, modVulns ModuleVulnerabilities) bool {
	pkg := pkgPath(chain.f)

	c := chain
	for {
	c = c.parent
	// Analyze the immediate substack related to pkg.
	if c == nil \|\| pkgPath(c.f) != pkg {
	break
	}

	// some synthetic functions might not have an associated package
	if c.f.Pkg == nil \|\| c.f.Pkg.Pkg == nil {
	continue
	}

	// TODO: can we optimize using the information on findings already reported?
	if len(modVulns.VulnsForSymbol(c.f.Pkg.Pkg.Path(), dbFuncName(c.f))) > 0 {
	return true
	}
	}
	return false
	}