[release-branch.go1.18] regexp: limit size of parsed regexps

Set a 128 MB limit on the amount of space used by []syntax.Inst
in the compiled form corresponding to a given regexp.

Also set a 128 MB limit on the rune storage in the *syntax.Regexp
tree itself.

Thanks to Adam Korczynski (ADA Logics) and OSS-Fuzz for reporting this issue.

Fixes CVE-2022-41715.
Updates #55949.
Fixes #55950.

Change-Id: Ia656baed81564436368cf950e1c5409752f28e1b
Reviewed-on: https://team-review.git.corp.google.com/c/golang/go-private/+/1592136
TryBot-Result: Security TryBots <security-trybots@go-security-trybots.iam.gserviceaccount.com>
Reviewed-by: Damien Neil <dneil@google.com>
Run-TryBot: Roland Shoemaker <bracewell@google.com>
Reviewed-by: Julie Qiu <julieqiu@google.com>
Reviewed-on: https://go-review.googlesource.com/c/go/+/438501
Run-TryBot: Carlos Amedee <carlos@golang.org>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go
index 0f6587a..df83980 100644
--- a/src/regexp/syntax/parse.go
+++ b/src/regexp/syntax/parse.go
@@ -90,15 +90,49 @@
 // until we've allocated at least maxHeight Regexp structures.
 const maxHeight = 1000
 
+// maxSize is the maximum size of a compiled regexp in Insts.
+// It too is somewhat arbitrarily chosen, but the idea is to be large enough
+// to allow significant regexps while at the same time small enough that
+// the compiled form will not take up too much memory.
+// 128 MB is enough for a 3.3 million Inst structures, which roughly
+// corresponds to a 3.3 MB regexp.
+const (
+	maxSize  = 128 << 20 / instSize
+	instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
+)
+
+// maxRunes is the maximum number of runes allowed in a regexp tree
+// counting the runes in all the nodes.
+// Ignoring character classes p.numRunes is always less than the length of the regexp.
+// Character classes can make it much larger: each \pL adds 1292 runes.
+// 128 MB is enough for 32M runes, which is over 26k \pL instances.
+// Note that repetitions do not make copies of the rune slices,
+// so \pL{1000} is only one rune slice, not 1000.
+// We could keep a cache of character classes we've seen,
+// so that all the \pL we see use the same rune list,
+// but that doesn't remove the problem entirely:
+// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
+// And because the Rune slice is exposed directly in the Regexp,
+// there is not an opportunity to change the representation to allow
+// partial sharing between different character classes.
+// So the limit is the best we can do.
+const (
+	maxRunes = 128 << 20 / runeSize
+	runeSize = 4 // rune is int32
+)
+
 type parser struct {
 	flags       Flags     // parse mode flags
 	stack       []*Regexp // stack of parsed expressions
 	free        *Regexp
 	numCap      int // number of capturing groups seen
 	wholeRegexp string
-	tmpClass    []rune          // temporary char class work space
-	numRegexp   int             // number of regexps allocated
-	height      map[*Regexp]int // regexp height for height limit check
+	tmpClass    []rune            // temporary char class work space
+	numRegexp   int               // number of regexps allocated
+	numRunes    int               // number of runes in char classes
+	repeats     int64             // product of all repetitions seen
+	height      map[*Regexp]int   // regexp height, for height limit check
+	size        map[*Regexp]int64 // regexp compiled size, for size limit check
 }
 
 func (p *parser) newRegexp(op Op) *Regexp {
@@ -122,6 +156,104 @@
 	p.free = re
 }
 
+func (p *parser) checkLimits(re *Regexp) {
+	if p.numRunes > maxRunes {
+		panic(ErrInternalError)
+	}
+	p.checkSize(re)
+	p.checkHeight(re)
+}
+
+func (p *parser) checkSize(re *Regexp) {
+	if p.size == nil {
+		// We haven't started tracking size yet.
+		// Do a relatively cheap check to see if we need to start.
+		// Maintain the product of all the repeats we've seen
+		// and don't track if the total number of regexp nodes
+		// we've seen times the repeat product is in budget.
+		if p.repeats == 0 {
+			p.repeats = 1
+		}
+		if re.Op == OpRepeat {
+			n := re.Max
+			if n == -1 {
+				n = re.Min
+			}
+			if n <= 0 {
+				n = 1
+			}
+			if int64(n) > maxSize/p.repeats {
+				p.repeats = maxSize
+			} else {
+				p.repeats *= int64(n)
+			}
+		}
+		if int64(p.numRegexp) < maxSize/p.repeats {
+			return
+		}
+
+		// We need to start tracking size.
+		// Make the map and belatedly populate it
+		// with info about everything we've constructed so far.
+		p.size = make(map[*Regexp]int64)
+		for _, re := range p.stack {
+			p.checkSize(re)
+		}
+	}
+
+	if p.calcSize(re, true) > maxSize {
+		panic(ErrInternalError)
+	}
+}
+
+func (p *parser) calcSize(re *Regexp, force bool) int64 {
+	if !force {
+		if size, ok := p.size[re]; ok {
+			return size
+		}
+	}
+
+	var size int64
+	switch re.Op {
+	case OpLiteral:
+		size = int64(len(re.Rune))
+	case OpCapture, OpStar:
+		// star can be 1+ or 2+; assume 2 pessimistically
+		size = 2 + p.calcSize(re.Sub[0], false)
+	case OpPlus, OpQuest:
+		size = 1 + p.calcSize(re.Sub[0], false)
+	case OpConcat:
+		for _, sub := range re.Sub {
+			size += p.calcSize(sub, false)
+		}
+	case OpAlternate:
+		for _, sub := range re.Sub {
+			size += p.calcSize(sub, false)
+		}
+		if len(re.Sub) > 1 {
+			size += int64(len(re.Sub)) - 1
+		}
+	case OpRepeat:
+		sub := p.calcSize(re.Sub[0], false)
+		if re.Max == -1 {
+			if re.Min == 0 {
+				size = 2 + sub // x*
+			} else {
+				size = 1 + int64(re.Min)*sub // xxx+
+			}
+			break
+		}
+		// x{2,5} = xx(x(x(x)?)?)?
+		size = int64(re.Max)*sub + int64(re.Max-re.Min)
+	}
+
+	if size < 1 {
+		size = 1
+	}
+	p.size[re] = size
+	return size
+}
+
 func (p *parser) checkHeight(re *Regexp) {
 	if p.numRegexp < maxHeight {
 		return
@@ -158,6 +290,7 @@
 
 // push pushes the regexp re onto the parse stack and returns the regexp.
 func (p *parser) push(re *Regexp) *Regexp {
+	p.numRunes += len(re.Rune)
 	if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
 		// Single rune.
 		if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
@@ -189,7 +322,7 @@
 	}
 
 	p.stack = append(p.stack, re)
-	p.checkHeight(re)
+	p.checkLimits(re)
 	return re
 }
 
@@ -299,7 +432,7 @@
 	re.Sub = re.Sub0[:1]
 	re.Sub[0] = sub
 	p.stack[n-1] = re
-	p.checkHeight(re)
+	p.checkLimits(re)
 
 	if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
 		return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
@@ -503,6 +636,7 @@
 
 			for j := start; j < i; j++ {
 				sub[j] = p.removeLeadingString(sub[j], len(str))
+				p.checkLimits(sub[j])
 			}
 			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
 
@@ -560,6 +694,7 @@
 			for j := start; j < i; j++ {
 				reuse := j != start // prefix came from sub[start]
 				sub[j] = p.removeLeadingRegexp(sub[j], reuse)
+				p.checkLimits(sub[j])
 			}
 			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
 
diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go
index 1ef6d8a..67e3c56 100644
--- a/src/regexp/syntax/parse_test.go
+++ b/src/regexp/syntax/parse_test.go
@@ -484,12 +484,15 @@
 	`(?P<>a)`,
 	`[a-Z]`,
 	`(?i)[a-Z]`,
-	`a{100000}`,
-	`a{100000,}`,
-	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
-	strings.Repeat("(", 1000) + strings.Repeat(")", 1000),
-	strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000),
 	`\Q\E*`,
+	`a{100000}`,  // too much repetition
+	`a{100000,}`, // too much repetition
+	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",    // too much repetition
+	strings.Repeat("(", 1000) + strings.Repeat(")", 1000),    // too deep
+	strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000), // too deep
+	"(" + strings.Repeat("(xx?)", 1000) + "){1000}",          // too long
+	strings.Repeat("(xx?){1000}", 1000),                      // too long
+	strings.Repeat(`\pL`, 27000),                             // too many runes
 }
 
 var onlyPerl = []string{