go/scanner: accept new Go2 number literals

This CL introduces go/scanner support for the new binary and octal integer
literals, hexadecimal floats, and digit separators for all number literals.
The new code is closely mirroring the respective code for number literals in
cmd/compile/internal/syntax/scanner.go.

R=Go1.13

Updates #12711.
Updates #19308.
Updates #28493.
Updates #29008.

Change-Id: I5315c6aaa7cfc41a618296be20e3acd5114d6b3c
Reviewed-on: https://go-review.googlesource.com/c/159997
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
diff --git a/src/go/scanner/scanner.go b/src/go/scanner/scanner.go
index e78abf1..9e85d48 100644
--- a/src/go/scanner/scanner.go
+++ b/src/go/scanner/scanner.go
@@ -150,6 +150,10 @@
 	s.ErrorCount++
 }
 
+func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
+	s.error(offs, fmt.Sprintf(format, args...))
+}
+
 func (s *Scanner) scanComment() string {
 	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
 	offs := s.offset - 1 // position of initial '/'
@@ -336,11 +340,11 @@
 }
 
 func isLetter(ch rune) bool {
-	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
+	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
 }
 
 func isDigit(ch rune) bool {
-	return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
+	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
 }
 
 func (s *Scanner) scanIdentifier() string {
@@ -355,95 +359,188 @@
 	switch {
 	case '0' <= ch && ch <= '9':
 		return int(ch - '0')
-	case 'a' <= ch && ch <= 'f':
-		return int(ch - 'a' + 10)
-	case 'A' <= ch && ch <= 'F':
-		return int(ch - 'A' + 10)
+	case 'a' <= lower(ch) && lower(ch) <= 'f':
+		return int(lower(ch) - 'a' + 10)
 	}
 	return 16 // larger than any legal digit val
 }
 
-func (s *Scanner) scanMantissa(base int) {
-	for digitVal(s.ch) < base {
-		s.next()
+func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
+func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
+func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
+
+// digits accepts the sequence { digit | '_' }.
+// If base <= 10, digits accepts any decimal digit but records
+// the offset (relative to the source start) of a digit >= base
+// in *invalid, if *invalid < 0.
+// digits returns a bitset describing whether the sequence contained
+// digits (bit 0 is set), or separators '_' (bit 1 is set).
+func (s *Scanner) digits(base int, invalid *int) (digsep int) {
+	if base <= 10 {
+		max := rune('0' + base)
+		for isDecimal(s.ch) || s.ch == '_' {
+			ds := 1
+			if s.ch == '_' {
+				ds = 2
+			} else if s.ch >= max && *invalid < 0 {
+				*invalid = int(s.offset) // record invalid rune offset
+			}
+			digsep |= ds
+			s.next()
+		}
+	} else {
+		for isHex(s.ch) || s.ch == '_' {
+			ds := 1
+			if s.ch == '_' {
+				ds = 2
+			}
+			digsep |= ds
+			s.next()
+		}
 	}
+	return
 }
 
-func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
-	// digitVal(s.ch) < 10
+func (s *Scanner) scanNumber() (token.Token, string) {
 	offs := s.offset
-	tok := token.INT
+	tok := token.ILLEGAL
 
-	if seenDecimalPoint {
-		offs--
-		tok = token.FLOAT
-		s.scanMantissa(10)
-		goto exponent
-	}
+	base := 10        // number base
+	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
+	digsep := 0       // bit 0: digit present, bit 1: '_' present
+	invalid := -1     // index of invalid digit in literal, or < 0
 
-	if s.ch == '0' {
-		// int or float
-		offs := s.offset
-		s.next()
-		if s.ch == 'x' || s.ch == 'X' {
-			// hexadecimal int
+	// integer part
+	if s.ch != '.' {
+		tok = token.INT
+		if s.ch == '0' {
 			s.next()
-			s.scanMantissa(16)
-			if s.offset-offs <= 2 {
-				// only scanned "0x" or "0X"
-				s.error(offs, "illegal hexadecimal number")
-			}
-		} else {
-			// octal int or float
-			seenDecimalDigit := false
-			s.scanMantissa(8)
-			if s.ch == '8' || s.ch == '9' {
-				// illegal octal int or float
-				seenDecimalDigit = true
-				s.scanMantissa(10)
-			}
-			if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
-				goto fraction
-			}
-			// octal int
-			if seenDecimalDigit {
-				s.error(offs, "illegal octal number")
+			switch lower(s.ch) {
+			case 'x':
+				s.next()
+				base, prefix = 16, 'x'
+			case 'o':
+				s.next()
+				base, prefix = 8, 'o'
+			case 'b':
+				s.next()
+				base, prefix = 2, 'b'
+			default:
+				base, prefix = 8, '0'
+				digsep = 1 // leading 0
 			}
 		}
-		goto exit
+		digsep |= s.digits(base, &invalid)
 	}
 
-	// decimal int or float
-	s.scanMantissa(10)
-
-fraction:
+	// fractional part
 	if s.ch == '.' {
 		tok = token.FLOAT
+		if prefix == 'o' || prefix == 'b' {
+			s.error(s.offset, "invalid radix point in "+litname(prefix))
+		}
 		s.next()
-		s.scanMantissa(10)
+		digsep |= s.digits(base, &invalid)
 	}
 
-exponent:
-	if s.ch == 'e' || s.ch == 'E' {
-		tok = token.FLOAT
+	if digsep&1 == 0 {
+		s.error(s.offset, litname(prefix)+" has no digits")
+	}
+
+	// exponent
+	if e := lower(s.ch); e == 'e' || e == 'p' {
+		switch {
+		case e == 'e' && prefix != 0 && prefix != '0':
+			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
+		case e == 'p' && prefix != 'x':
+			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
+		}
 		s.next()
-		if s.ch == '-' || s.ch == '+' {
+		tok = token.FLOAT
+		if s.ch == '+' || s.ch == '-' {
 			s.next()
 		}
-		if digitVal(s.ch) < 10 {
-			s.scanMantissa(10)
-		} else {
-			s.error(offs, "illegal floating-point exponent")
+		ds := s.digits(10, nil)
+		digsep |= ds
+		if ds&1 == 0 {
+			s.error(s.offset, "exponent has no digits")
 		}
+	} else if prefix == 'x' && tok == token.FLOAT {
+		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
 	}
 
+	// suffix 'i'
 	if s.ch == 'i' {
 		tok = token.IMAG
+		if prefix != 0 && prefix != '0' {
+			s.error(s.offset, "invalid suffix 'i' on "+litname(prefix))
+		}
 		s.next()
 	}
 
-exit:
-	return tok, string(s.src[offs:s.offset])
+	lit := string(s.src[offs:s.offset])
+	if tok == token.INT && invalid >= 0 {
+		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
+	}
+	if digsep&2 != 0 {
+		if i := invalidSep(lit); i >= 0 {
+			s.error(offs+i, "'_' must separate successive digits")
+		}
+	}
+
+	return tok, lit
+}
+
+func litname(prefix rune) string {
+	switch prefix {
+	case 'x':
+		return "hexadecimal literal"
+	case 'o', '0':
+		return "octal literal"
+	case 'b':
+		return "binary literal"
+	}
+	return "decimal literal"
+}
+
+// invalidSep returns the index of the first invalid separator in x, or -1.
+func invalidSep(x string) int {
+	x1 := ' ' // prefix char, we only care if it's 'x'
+	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
+	i := 0
+
+	// a prefix counts as a digit
+	if len(x) >= 2 && x[0] == '0' {
+		x1 = lower(rune(x[1]))
+		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
+			d = '0'
+			i = 2
+		}
+	}
+
+	// mantissa and exponent
+	for ; i < len(x); i++ {
+		p := d // previous digit
+		d = rune(x[i])
+		switch {
+		case d == '_':
+			if p != '0' {
+				return i
+			}
+		case isDecimal(d) || x1 == 'x' && isHex(d):
+			d = '0'
+		default:
+			if p == '_' {
+				return i - 1
+			}
+			d = '.'
+		}
+	}
+	if d == '_' {
+		return len(x) - 1
+	}
+
+	return -1
 }
 
 // scanEscape parses an escape sequence where rune is the accepted
@@ -708,9 +805,9 @@
 			insertSemi = true
 			tok = token.IDENT
 		}
-	case '0' <= ch && ch <= '9':
+	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
 		insertSemi = true
-		tok, lit = s.scanNumber(false)
+		tok, lit = s.scanNumber()
 	default:
 		s.next() // always make progress
 		switch ch {
@@ -741,16 +838,12 @@
 		case ':':
 			tok = s.switch2(token.COLON, token.DEFINE)
 		case '.':
-			if '0' <= s.ch && s.ch <= '9' {
-				insertSemi = true
-				tok, lit = s.scanNumber(true)
-			} else {
-				tok = token.PERIOD
-				if s.ch == '.' && s.peek() == '.' {
-					s.next()
-					s.next() // consume last '.'
-					tok = token.ELLIPSIS
-				}
+			// fractions starting with a '.' are handled by outer switch
+			tok = token.PERIOD
+			if s.ch == '.' && s.peek() == '.' {
+				s.next()
+				s.next() // consume last '.'
+				tok = token.ELLIPSIS
 			}
 		case ',':
 			tok = token.COMMA
@@ -835,7 +928,7 @@
 		default:
 			// next reports unexpected BOMs - don't repeat
 			if ch != bom {
-				s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
+				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
 			}
 			insertSemi = s.insertSemi // preserve insertSemi info
 			tok = token.ILLEGAL
diff --git a/src/go/scanner/scanner_test.go b/src/go/scanner/scanner_test.go
index 36c9622..1d6865f 100644
--- a/src/go/scanner/scanner_test.go
+++ b/src/go/scanner/scanner_test.go
@@ -10,6 +10,7 @@
 	"os"
 	"path/filepath"
 	"runtime"
+	"strings"
 	"testing"
 )
 
@@ -802,11 +803,10 @@
 	{"078.", token.FLOAT, 0, "078.", ""},
 	{"07801234567.", token.FLOAT, 0, "07801234567.", ""},
 	{"078e0", token.FLOAT, 0, "078e0", ""},
-	{"0E", token.FLOAT, 0, "0E", "illegal floating-point exponent"}, // issue 17621
-	{"078", token.INT, 0, "078", "illegal octal number"},
-	{"07800000009", token.INT, 0, "07800000009", "illegal octal number"},
-	{"0x", token.INT, 0, "0x", "illegal hexadecimal number"},
-	{"0X", token.INT, 0, "0X", "illegal hexadecimal number"},
+	{"0E", token.FLOAT, 2, "0E", "exponent has no digits"}, // issue 17621
+	{"078", token.INT, 2, "078", "invalid digit '8' in octal literal"},
+	{"07090000008", token.INT, 3, "07090000008", "invalid digit '9' in octal literal"},
+	{"0x", token.INT, 2, "0x", "hexadecimal literal has no digits"},
 	{"\"abc\x00def\"", token.STRING, 4, "\"abc\x00def\"", "illegal character NUL"},
 	{"\"abc\x80def\"", token.STRING, 4, "\"abc\x80def\"", "illegal UTF-8 encoding"},
 	{"\ufeff\ufeff", token.ILLEGAL, 3, "\ufeff\ufeff", "illegal byte order mark"},                        // only first BOM is ignored
@@ -912,3 +912,199 @@
 		}
 	}
 }
+
+func TestNumbers(t *testing.T) {
+	for _, test := range []struct {
+		tok              token.Token
+		src, tokens, err string
+	}{
+		// binaries
+		{token.INT, "0b0", "0b0", ""},
+		{token.INT, "0b1010", "0b1010", ""},
+		{token.INT, "0B1110", "0B1110", ""},
+
+		{token.INT, "0b", "0b", "binary literal has no digits"},
+		{token.INT, "0b0190", "0b0190", "invalid digit '9' in binary literal"},
+		{token.INT, "0b01a0", "0b01 a0", ""}, // only accept 0-9
+
+		// binary floats and imaginaries (invalid)
+		{token.FLOAT, "0b.", "0b.", "invalid radix point in binary literal"},
+		{token.FLOAT, "0b.1", "0b.1", "invalid radix point in binary literal"},
+		{token.FLOAT, "0b1.0", "0b1.0", "invalid radix point in binary literal"},
+		{token.FLOAT, "0b1e10", "0b1e10", "'e' exponent requires decimal mantissa"},
+		{token.FLOAT, "0b1P-1", "0b1P-1", "'P' exponent requires hexadecimal mantissa"},
+		{token.IMAG, "0b10i", "0b10i", "invalid suffix 'i' on binary literal"},
+
+		// octals
+		{token.INT, "0o0", "0o0", ""},
+		{token.INT, "0o1234", "0o1234", ""},
+		{token.INT, "0O1234", "0O1234", ""},
+
+		{token.INT, "0o", "0o", "octal literal has no digits"},
+		{token.INT, "0o8123", "0o8123", "invalid digit '8' in octal literal"},
+		{token.INT, "0o1293", "0o1293", "invalid digit '9' in octal literal"},
+		{token.INT, "0o12a3", "0o12 a3", ""}, // only accept 0-9
+
+		// octal floats and imaginaries (invalid)
+		{token.FLOAT, "0o.", "0o.", "invalid radix point in octal literal"},
+		{token.FLOAT, "0o.2", "0o.2", "invalid radix point in octal literal"},
+		{token.FLOAT, "0o1.2", "0o1.2", "invalid radix point in octal literal"},
+		{token.FLOAT, "0o1E+2", "0o1E+2", "'E' exponent requires decimal mantissa"},
+		{token.FLOAT, "0o1p10", "0o1p10", "'p' exponent requires hexadecimal mantissa"},
+		{token.IMAG, "0o10i", "0o10i", "invalid suffix 'i' on octal literal"},
+
+		// 0-octals
+		{token.INT, "0", "0", ""},
+		{token.INT, "0123", "0123", ""},
+
+		{token.INT, "08123", "08123", "invalid digit '8' in octal literal"},
+		{token.INT, "01293", "01293", "invalid digit '9' in octal literal"},
+		{token.INT, "0F.", "0 F .", ""}, // only accept 0-9
+		{token.INT, "0123F.", "0123 F .", ""},
+		{token.INT, "0123456x", "0123456 x", ""},
+
+		// decimals
+		{token.INT, "1", "1", ""},
+		{token.INT, "1234", "1234", ""},
+
+		{token.INT, "1f", "1 f", ""}, // only accept 0-9
+
+		// decimal floats
+		{token.FLOAT, "0.", "0.", ""},
+		{token.FLOAT, "123.", "123.", ""},
+		{token.FLOAT, "0123.", "0123.", ""},
+
+		{token.FLOAT, ".0", ".0", ""},
+		{token.FLOAT, ".123", ".123", ""},
+		{token.FLOAT, ".0123", ".0123", ""},
+
+		{token.FLOAT, "0.0", "0.0", ""},
+		{token.FLOAT, "123.123", "123.123", ""},
+		{token.FLOAT, "0123.0123", "0123.0123", ""},
+
+		{token.FLOAT, "0e0", "0e0", ""},
+		{token.FLOAT, "123e+0", "123e+0", ""},
+		{token.FLOAT, "0123E-1", "0123E-1", ""},
+
+		{token.FLOAT, "0.e+1", "0.e+1", ""},
+		{token.FLOAT, "123.E-10", "123.E-10", ""},
+		{token.FLOAT, "0123.e123", "0123.e123", ""},
+
+		{token.FLOAT, ".0e-1", ".0e-1", ""},
+		{token.FLOAT, ".123E+10", ".123E+10", ""},
+		{token.FLOAT, ".0123E123", ".0123E123", ""},
+
+		{token.FLOAT, "0.0e1", "0.0e1", ""},
+		{token.FLOAT, "123.123E-10", "123.123E-10", ""},
+		{token.FLOAT, "0123.0123e+456", "0123.0123e+456", ""},
+
+		{token.FLOAT, "0e", "0e", "exponent has no digits"},
+		{token.FLOAT, "0E+", "0E+", "exponent has no digits"},
+		{token.FLOAT, "1e+f", "1e+ f", "exponent has no digits"},
+		{token.FLOAT, "0p0", "0p0", "'p' exponent requires hexadecimal mantissa"},
+		{token.FLOAT, "1.0P-1", "1.0P-1", "'P' exponent requires hexadecimal mantissa"},
+
+		// decimal imaginaries
+		{token.IMAG, "0.i", "0.i", ""},
+		{token.IMAG, ".123i", ".123i", ""},
+		{token.IMAG, "123.123i", "123.123i", ""},
+		{token.IMAG, "123e+0i", "123e+0i", ""},
+		{token.IMAG, "123.E-10i", "123.E-10i", ""},
+		{token.IMAG, ".123E+10i", ".123E+10i", ""},
+
+		// hexadecimals
+		{token.INT, "0x0", "0x0", ""},
+		{token.INT, "0x1234", "0x1234", ""},
+		{token.INT, "0xcafef00d", "0xcafef00d", ""},
+		{token.INT, "0XCAFEF00D", "0XCAFEF00D", ""},
+
+		{token.INT, "0x", "0x", "hexadecimal literal has no digits"},
+		{token.INT, "0x1g", "0x1 g", ""},
+
+		// hexadecimal floats
+		{token.FLOAT, "0x0p0", "0x0p0", ""},
+		{token.FLOAT, "0x12efp-123", "0x12efp-123", ""},
+		{token.FLOAT, "0xABCD.p+0", "0xABCD.p+0", ""},
+		{token.FLOAT, "0x.0189P-0", "0x.0189P-0", ""},
+		{token.FLOAT, "0x1.ffffp+1023", "0x1.ffffp+1023", ""},
+
+		{token.FLOAT, "0x.", "0x.", "hexadecimal literal has no digits"},
+		{token.FLOAT, "0x0.", "0x0.", "hexadecimal mantissa requires a 'p' exponent"},
+		{token.FLOAT, "0x.0", "0x.0", "hexadecimal mantissa requires a 'p' exponent"},
+		{token.FLOAT, "0x1.1", "0x1.1", "hexadecimal mantissa requires a 'p' exponent"},
+		{token.FLOAT, "0x1.1e0", "0x1.1e0", "hexadecimal mantissa requires a 'p' exponent"},
+		{token.FLOAT, "0x1.2gp1a", "0x1.2 gp1a", "hexadecimal mantissa requires a 'p' exponent"},
+		{token.FLOAT, "0x0p", "0x0p", "exponent has no digits"},
+		{token.FLOAT, "0xeP-", "0xeP-", "exponent has no digits"},
+		{token.FLOAT, "0x1234PAB", "0x1234P AB", "exponent has no digits"},
+		{token.FLOAT, "0x1.2p1a", "0x1.2p1 a", ""},
+
+		// hexadecimal imaginaries (invalid)
+		{token.IMAG, "0xf00i", "0xf00i", "invalid suffix 'i' on hexadecimal literal"},
+		{token.IMAG, "0xf00.bap+12i", "0xf00.bap+12i", "invalid suffix 'i' on hexadecimal literal"},
+
+		// separators
+		{token.INT, "0b_1000_0001", "0b_1000_0001", ""},
+		{token.INT, "0o_600", "0o_600", ""},
+		{token.INT, "0_466", "0_466", ""},
+		{token.INT, "1_000", "1_000", ""},
+		{token.FLOAT, "1_000.000_1", "1_000.000_1", ""},
+		{token.IMAG, "10e+1_2_3i", "10e+1_2_3i", ""},
+		{token.INT, "0x_f00d", "0x_f00d", ""},
+		{token.FLOAT, "0x_f00d.0p1_2", "0x_f00d.0p1_2", ""},
+
+		{token.INT, "0b__1000", "0b__1000", "'_' must separate successive digits"},
+		{token.INT, "0o60___0", "0o60___0", "'_' must separate successive digits"},
+		{token.INT, "0466_", "0466_", "'_' must separate successive digits"},
+		{token.FLOAT, "1_.", "1_.", "'_' must separate successive digits"},
+		{token.FLOAT, "0._1", "0._1", "'_' must separate successive digits"},
+		{token.FLOAT, "2.7_e0", "2.7_e0", "'_' must separate successive digits"},
+		{token.IMAG, "10e+12_i", "10e+12_i", "'_' must separate successive digits"},
+		{token.INT, "0x___0", "0x___0", "'_' must separate successive digits"},
+		{token.FLOAT, "0x1.0_p0", "0x1.0_p0", "'_' must separate successive digits"},
+	} {
+		var s Scanner
+		var err string
+		s.Init(fset.AddFile("", fset.Base(), len(test.src)), []byte(test.src), func(_ token.Position, msg string) {
+			if err == "" {
+				err = msg
+			}
+		}, 0)
+		for i, want := range strings.Split(test.tokens, " ") {
+			err = ""
+			_, tok, lit := s.Scan()
+
+			// compute lit where for tokens where lit is not defined
+			switch tok {
+			case token.PERIOD:
+				lit = "."
+			case token.ADD:
+				lit = "+"
+			case token.SUB:
+				lit = "-"
+			}
+
+			if i == 0 {
+				if tok != test.tok {
+					t.Errorf("%q: got token %s; want %s", test.src, tok, test.tok)
+				}
+				if err != test.err {
+					t.Errorf("%q: got error %q; want %q", test.src, err, test.err)
+				}
+			}
+
+			if lit != want {
+				t.Errorf("%q: got literal %q (%s); want %s", test.src, lit, tok, want)
+			}
+		}
+
+		// make sure we read all
+		_, tok, _ := s.Scan()
+		if tok == token.SEMICOLON {
+			_, tok, _ = s.Scan()
+		}
+		if tok != token.EOF {
+			t.Errorf("%q: got %s; want EOF", test.src, tok)
+		}
+	}
+}