compiler/protogen, internal/strs, internal/impl: expose enum Go name derivation

In order to migrate v1 to wrap v2, we need a way to reproduce
the awful enum "names" that v1 used, each of which was the concatenation
of the proto package with the Go identifier used for the enum.

To support this:
* Move the camel case and Go identifier sanitizing logic from compiler/protogen to internal/strs
* Add a small stub in internal/impl to expose this functionality

Change-Id: I8ff31daa9ae541e5788dc04d2e89eae1574877e4
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/191637
Reviewed-by: Damien Neil <dneil@google.com>
diff --git a/cmd/protoc-gen-go/internal_gengo/main.go b/cmd/protoc-gen-go/internal_gengo/main.go
index 0bb1f4f..dfc3102 100644
--- a/cmd/protoc-gen-go/internal_gengo/main.go
+++ b/cmd/protoc-gen-go/internal_gengo/main.go
@@ -780,14 +780,7 @@
 func fieldProtobufTagValue(field *protogen.Field) string {
 	var enumName string
 	if field.Desc.Kind() == protoreflect.EnumKind {
-		// For historical reasons, the name used in the tag is neither
-		// the protobuf full name nor the fully qualified Go identifier,
-		// but an odd mix of both.
-		enumName = field.Enum.GoIdent.GoName
-		protoPkg := string(field.Enum.Desc.ParentFile().Package())
-		if protoPkg != "" {
-			enumName = protoPkg + "." + enumName
-		}
+		enumName = protoimpl.X.LegacyEnumName(field.Enum.Desc)
 	}
 	return tag.Marshal(field.Desc, enumName)
 }
diff --git a/compiler/protogen/names.go b/compiler/protogen/names.go
deleted file mode 100644
index ae41a5a..0000000
--- a/compiler/protogen/names.go
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package protogen
-
-import (
-	"fmt"
-	"go/token"
-	"strconv"
-	"strings"
-	"unicode"
-	"unicode/utf8"
-
-	"google.golang.org/protobuf/reflect/protoreflect"
-)
-
-// A GoIdent is a Go identifier, consisting of a name and import path.
-// The name is a single identifier and may not be a dot-qualified selector.
-type GoIdent struct {
-	GoName       string
-	GoImportPath GoImportPath
-}
-
-func (id GoIdent) String() string { return fmt.Sprintf("%q.%v", id.GoImportPath, id.GoName) }
-
-// newGoIdent returns the Go identifier for a descriptor.
-func newGoIdent(f *File, d protoreflect.Descriptor) GoIdent {
-	name := strings.TrimPrefix(string(d.FullName()), string(f.Desc.Package())+".")
-	return GoIdent{
-		GoName:       camelCase(name),
-		GoImportPath: f.GoImportPath,
-	}
-}
-
-// A GoImportPath is the import path of a Go package. e.g., "google.golang.org/genproto/protobuf".
-type GoImportPath string
-
-func (p GoImportPath) String() string { return strconv.Quote(string(p)) }
-
-// Ident returns a GoIdent with s as the GoName and p as the GoImportPath.
-func (p GoImportPath) Ident(s string) GoIdent {
-	return GoIdent{GoName: s, GoImportPath: p}
-}
-
-// A GoPackageName is the name of a Go package. e.g., "protobuf".
-type GoPackageName string
-
-// cleanPackageName converts a string to a valid Go package name.
-func cleanPackageName(name string) GoPackageName {
-	return GoPackageName(cleanGoName(name))
-}
-
-// cleanGoName converts a string to a valid Go identifier.
-func cleanGoName(s string) string {
-	// Sanitize the input to the set of valid characters,
-	// which must be '_' or be in the Unicode L or N categories.
-	s = strings.Map(func(r rune) rune {
-		if unicode.IsLetter(r) || unicode.IsDigit(r) {
-			return r
-		}
-		return '_'
-	}, s)
-
-	// Prepend '_' in the event of a Go keyword conflict or if
-	// the identifier is invalid (does not start in the Unicode L category).
-	r, _ := utf8.DecodeRuneInString(s)
-	if token.Lookup(s).IsKeyword() || !unicode.IsLetter(r) {
-		return "_" + s
-	}
-	return s
-}
-
-// baseName returns the last path element of the name, with the last dotted suffix removed.
-func baseName(name string) string {
-	// First, find the last element
-	if i := strings.LastIndex(name, "/"); i >= 0 {
-		name = name[i+1:]
-	}
-	// Now drop the suffix
-	if i := strings.LastIndex(name, "."); i >= 0 {
-		name = name[:i]
-	}
-	return name
-}
-
-// camelCase converts a name to CamelCase.
-//
-// If there is an interior underscore followed by a lower case letter,
-// drop the underscore and convert the letter to upper case.
-// There is a remote possibility of this rewrite causing a name collision,
-// but it's so remote we're prepared to pretend it's nonexistent - since the
-// C++ generator lowercases names, it's extremely unlikely to have two fields
-// with different capitalizations.
-func camelCase(s string) string {
-	// Invariant: if the next letter is lower case, it must be converted
-	// to upper case.
-	// That is, we process a word at a time, where words are marked by _ or
-	// upper case letter. Digits are treated as words.
-	var b []byte
-	for i := 0; i < len(s); i++ {
-		c := s[i]
-		switch {
-		case c == '.' && i+1 < len(s) && isASCIILower(s[i+1]):
-			// Skip over '.' in ".{{lowercase}}".
-		case c == '.':
-			b = append(b, '_') // convert '.' to '_'
-		case c == '_' && (i == 0 || s[i-1] == '.'):
-			// Convert initial '_' to ensure we start with a capital letter.
-			// Do the same for '_' after '.' to match historic behavior.
-			b = append(b, 'X') // convert '_' to 'X'
-		case c == '_' && i+1 < len(s) && isASCIILower(s[i+1]):
-			// Skip over '_' in "_{{lowercase}}".
-		case isASCIIDigit(c):
-			b = append(b, c)
-		default:
-			// Assume we have a letter now - if not, it's a bogus identifier.
-			// The next word is a sequence of characters that must start upper case.
-			if isASCIILower(c) {
-				c -= 'a' - 'A' // convert lowercase to uppercase
-			}
-			b = append(b, c)
-
-			// Accept lower case sequence that follows.
-			for ; i+1 < len(s) && isASCIILower(s[i+1]); i++ {
-				b = append(b, s[i+1])
-			}
-		}
-	}
-	return string(b)
-}
-
-// Is c an ASCII lower-case letter?
-func isASCIILower(c byte) bool {
-	return 'a' <= c && c <= 'z'
-}
-
-// Is c an ASCII digit?
-func isASCIIDigit(c byte) bool {
-	return '0' <= c && c <= '9'
-}
diff --git a/compiler/protogen/names_test.go b/compiler/protogen/names_test.go
deleted file mode 100644
index 6f03cc9..0000000
--- a/compiler/protogen/names_test.go
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package protogen
-
-import "testing"
-
-func TestCamelCase(t *testing.T) {
-	tests := []struct {
-		in, want string
-	}{
-		{"", ""},
-		{"one", "One"},
-		{"one_two", "OneTwo"},
-		{"_my_field_name_2", "XMyFieldName_2"},
-		{"Something_Capped", "Something_Capped"},
-		{"my_Name", "My_Name"},
-		{"OneTwo", "OneTwo"},
-		{"_", "X"},
-		{"_a_", "XA_"},
-		{"one.two", "OneTwo"},
-		{"one.Two", "One_Two"},
-		{"one_two.three_four", "OneTwoThreeFour"},
-		{"one_two.Three_four", "OneTwo_ThreeFour"},
-		{"_one._two", "XOne_XTwo"},
-		{"SCREAMING_SNAKE_CASE", "SCREAMING_SNAKE_CASE"},
-		{"double__underscore", "Double_Underscore"},
-		{"camelCase", "CamelCase"},
-		{"go2proto", "Go2Proto"},
-		{"世界", "世界"},
-		{"x世界", "X世界"},
-		{"foo_bar世界", "FooBar世界"},
-	}
-	for _, tc := range tests {
-		if got := camelCase(tc.in); got != tc.want {
-			t.Errorf("CamelCase(%q) = %q, want %q", tc.in, got, tc.want)
-		}
-	}
-}
-
-func TestCleanGoName(t *testing.T) {
-	tests := []struct {
-		in, want string
-	}{
-		{"", "_"},
-		{"boo", "boo"},
-		{"Boo", "Boo"},
-		{"ßoo", "ßoo"},
-		{"default", "_default"},
-		{"hello", "hello"},
-		{"hello-world!!", "hello_world__"},
-		{"hello-\xde\xad\xbe\xef\x00", "hello_____"},
-		{"hello 世界", "hello_世界"},
-		{"世界", "世界"},
-	}
-	for _, tc := range tests {
-		if got := cleanGoName(tc.in); got != tc.want {
-			t.Errorf("cleanGoName(%q) = %q, want %q", tc.in, got, tc.want)
-		}
-	}
-}
diff --git a/compiler/protogen/protogen.go b/compiler/protogen/protogen.go
index ec03704..3be898d 100644
--- a/compiler/protogen/protogen.go
+++ b/compiler/protogen/protogen.go
@@ -30,6 +30,7 @@
 
 	"google.golang.org/protobuf/encoding/prototext"
 	"google.golang.org/protobuf/internal/fieldnum"
+	"google.golang.org/protobuf/internal/strs"
 	"google.golang.org/protobuf/proto"
 	"google.golang.org/protobuf/reflect/protodesc"
 	"google.golang.org/protobuf/reflect/protoreflect"
@@ -431,7 +432,7 @@
 		}
 	}
 	f.GoDescriptorIdent = GoIdent{
-		GoName:       "File_" + cleanGoName(p.GetName()),
+		GoName:       "File_" + strs.GoSanitized(p.GetName()),
 		GoImportPath: f.GoImportPath,
 	}
 	f.GeneratedFilenamePrefix = prefix
@@ -499,6 +500,8 @@
 	}
 	// A semicolon-delimited suffix delimits the import path and package name.
 	if i := strings.Index(opt, ";"); i >= 0 {
+		// TODO: The package name is explicitly provided by the .proto file.
+		// Rather than sanitizing it, we should pass it verbatim.
 		return cleanPackageName(opt[i+1:]), GoImportPath(opt[:i])
 	}
 	// The presence of a slash implies there's an import path.
@@ -756,7 +759,7 @@
 	default:
 		loc = message.Location.appendPath(fieldnum.DescriptorProto_Field, int32(desc.Index()))
 	}
-	camelCased := camelCase(string(desc.Name()))
+	camelCased := strs.GoCamelCase(string(desc.Name()))
 	var parentPrefix string
 	if message != nil {
 		parentPrefix = message.GoIdent.GoName + "_"
@@ -826,7 +829,7 @@
 
 func newOneof(gen *Plugin, f *File, message *Message, desc protoreflect.OneofDescriptor) *Oneof {
 	loc := message.Location.appendPath(fieldnum.DescriptorProto_OneofDecl, int32(desc.Index()))
-	camelCased := camelCase(string(desc.Name()))
+	camelCased := strs.GoCamelCase(string(desc.Name()))
 	parentPrefix := message.GoIdent.GoName + "_"
 	return &Oneof{
 		Desc:   desc,
@@ -860,7 +863,7 @@
 	loc := f.location(fieldnum.FileDescriptorProto_Service, int32(desc.Index()))
 	service := &Service{
 		Desc:     desc,
-		GoName:   camelCase(string(desc.Name())),
+		GoName:   strs.GoCamelCase(string(desc.Name())),
 		Location: loc,
 		Comments: f.comments[newPathKey(loc.Path)],
 	}
@@ -889,7 +892,7 @@
 	loc := service.Location.appendPath(fieldnum.ServiceDescriptorProto_Method, int32(desc.Index()))
 	method := &Method{
 		Desc:     desc,
-		GoName:   camelCase(string(desc.Name())),
+		GoName:   strs.GoCamelCase(string(desc.Name())),
 		Parent:   service,
 		Location: loc,
 		Comments: f.comments[newPathKey(loc.Path)],
@@ -1183,6 +1186,56 @@
 	return string(b), nil
 }
 
+// A GoIdent is a Go identifier, consisting of a name and import path.
+// The name is a single identifier and may not be a dot-qualified selector.
+type GoIdent struct {
+	GoName       string
+	GoImportPath GoImportPath
+}
+
+func (id GoIdent) String() string { return fmt.Sprintf("%q.%v", id.GoImportPath, id.GoName) }
+
+// newGoIdent returns the Go identifier for a descriptor.
+func newGoIdent(f *File, d protoreflect.Descriptor) GoIdent {
+	name := strings.TrimPrefix(string(d.FullName()), string(f.Desc.Package())+".")
+	return GoIdent{
+		GoName:       strs.GoCamelCase(name),
+		GoImportPath: f.GoImportPath,
+	}
+}
+
+// A GoImportPath is the import path of a Go package.
+// For example: "google.golang.org/protobuf/compiler/protogen"
+type GoImportPath string
+
+func (p GoImportPath) String() string { return strconv.Quote(string(p)) }
+
+// Ident returns a GoIdent with s as the GoName and p as the GoImportPath.
+func (p GoImportPath) Ident(s string) GoIdent {
+	return GoIdent{GoName: s, GoImportPath: p}
+}
+
+// A GoPackageName is the name of a Go package. e.g., "protobuf".
+type GoPackageName string
+
+// cleanPackageName converts a string to a valid Go package name.
+func cleanPackageName(name string) GoPackageName {
+	return GoPackageName(strs.GoSanitized(name))
+}
+
+// baseName returns the last path element of the name, with the last dotted suffix removed.
+func baseName(name string) string {
+	// First, find the last element
+	if i := strings.LastIndex(name, "/"); i >= 0 {
+		name = name[i+1:]
+	}
+	// Now drop the suffix
+	if i := strings.LastIndex(name, "."); i >= 0 {
+		name = name[:i]
+	}
+	return name
+}
+
 type pathType int
 
 const (
diff --git a/internal/impl/legacy_enum.go b/internal/impl/legacy_enum.go
index 279baa9..4ec31df 100644
--- a/internal/impl/legacy_enum.go
+++ b/internal/impl/legacy_enum.go
@@ -11,10 +11,27 @@
 	"sync"
 
 	"google.golang.org/protobuf/internal/filedesc"
+	"google.golang.org/protobuf/internal/strs"
 	"google.golang.org/protobuf/reflect/protoreflect"
 	pref "google.golang.org/protobuf/reflect/protoreflect"
 )
 
+// legacyEnumName returns the name of enums used in legacy code.
+// It is neither the protobuf full name nor the qualified Go name,
+// but rather an odd hybrid of both.
+func legacyEnumName(ed pref.EnumDescriptor) string {
+	var protoPkg string
+	enumName := string(ed.FullName())
+	if fd := ed.ParentFile(); fd != nil {
+		protoPkg = string(fd.Package())
+		enumName = strings.TrimPrefix(enumName, protoPkg+".")
+	}
+	if protoPkg == "" {
+		return strs.GoCamelCase(enumName)
+	}
+	return protoPkg + "." + strs.GoCamelCase(enumName)
+}
+
 // legacyWrapEnum wraps v as a protoreflect.Enum,
 // where v must be a int32 kind and not implement the v2 API already.
 func legacyWrapEnum(v reflect.Value) pref.Enum {
diff --git a/internal/impl/legacy_export.go b/internal/impl/legacy_export.go
index 07c16b5..29c1b01 100644
--- a/internal/impl/legacy_export.go
+++ b/internal/impl/legacy_export.go
@@ -21,6 +21,11 @@
 // These functions exist to support exported APIs in generated protobufs.
 // While these are deprecated, they cannot be removed for compatibility reasons.
 
+// LegacyEnumName returns the name of enums used in legacy code.
+func (Export) LegacyEnumName(ed pref.EnumDescriptor) string {
+	return legacyEnumName(ed)
+}
+
 // UnmarshalJSONEnum unmarshals an enum from a JSON-encoded input.
 // The input can either be a string representing the enum value by name,
 // or a number representing the enum number itself.
diff --git a/internal/impl/legacy_extension.go b/internal/impl/legacy_extension.go
index b484067..ec5420d 100644
--- a/internal/impl/legacy_extension.go
+++ b/internal/impl/legacy_extension.go
@@ -77,31 +77,10 @@
 		}
 	}
 
-	// Reconstruct the legacy enum full name, which is an odd mixture of the
-	// proto package name with the Go type name.
+	// Reconstruct the legacy enum full name.
 	var enumName string
 	if xd.Kind() == pref.EnumKind {
-		// Derive Go type name.
-		t := extType
-		if t.Kind() == reflect.Ptr || t.Kind() == reflect.Slice {
-			t = t.Elem()
-		}
-		enumName = t.Name()
-
-		// Derive the proto package name.
-		// For legacy enums, obtain the proto package from the raw descriptor.
-		var protoPkg string
-		if fd := xd.Enum().ParentFile(); fd != nil {
-			protoPkg = string(fd.Package())
-		}
-		if ed, ok := reflect.Zero(t).Interface().(enumV1); ok && protoPkg == "" {
-			b, _ := ed.EnumDescriptor()
-			protoPkg = string(legacyLoadFileDesc(b).Package())
-		}
-
-		if protoPkg != "" {
-			enumName = protoPkg + "." + enumName
-		}
+		enumName = legacyEnumName(xd.Enum())
 	}
 
 	// Derive the proto file that the extension was declared within.
diff --git a/internal/strs/strings.go b/internal/strs/strings.go
index 2208ff2..0b74e76 100644
--- a/internal/strs/strings.go
+++ b/internal/strs/strings.go
@@ -6,8 +6,10 @@
 package strs
 
 import (
+	"go/token"
 	"strings"
 	"unicode"
+	"unicode/utf8"
 
 	"google.golang.org/protobuf/internal/flags"
 	"google.golang.org/protobuf/reflect/protoreflect"
@@ -23,6 +25,68 @@
 	return fd.Syntax() == protoreflect.Proto3
 }
 
+// GoCamelCase camel-cases a protobuf name for use as a Go identifier.
+//
+// If there is an interior underscore followed by a lower case letter,
+// drop the underscore and convert the letter to upper case.
+func GoCamelCase(s string) string {
+	// Invariant: if the next letter is lower case, it must be converted
+	// to upper case.
+	// That is, we process a word at a time, where words are marked by _ or
+	// upper case letter. Digits are treated as words.
+	var b []byte
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		switch {
+		case c == '.' && i+1 < len(s) && isASCIILower(s[i+1]):
+			// Skip over '.' in ".{{lowercase}}".
+		case c == '.':
+			b = append(b, '_') // convert '.' to '_'
+		case c == '_' && (i == 0 || s[i-1] == '.'):
+			// Convert initial '_' to ensure we start with a capital letter.
+			// Do the same for '_' after '.' to match historic behavior.
+			b = append(b, 'X') // convert '_' to 'X'
+		case c == '_' && i+1 < len(s) && isASCIILower(s[i+1]):
+			// Skip over '_' in "_{{lowercase}}".
+		case isASCIIDigit(c):
+			b = append(b, c)
+		default:
+			// Assume we have a letter now - if not, it's a bogus identifier.
+			// The next word is a sequence of characters that must start upper case.
+			if isASCIILower(c) {
+				c -= 'a' - 'A' // convert lowercase to uppercase
+			}
+			b = append(b, c)
+
+			// Accept lower case sequence that follows.
+			for ; i+1 < len(s) && isASCIILower(s[i+1]); i++ {
+				b = append(b, s[i+1])
+			}
+		}
+	}
+	return string(b)
+}
+
+// GoSanitized converts a string to a valid Go identifier.
+func GoSanitized(s string) string {
+	// Sanitize the input to the set of valid characters,
+	// which must be '_' or be in the Unicode L or N categories.
+	s = strings.Map(func(r rune) rune {
+		if unicode.IsLetter(r) || unicode.IsDigit(r) {
+			return r
+		}
+		return '_'
+	}, s)
+
+	// Prepend '_' in the event of a Go keyword conflict or if
+	// the identifier is invalid (does not start in the Unicode L category).
+	r, _ := utf8.DecodeRuneInString(s)
+	if token.Lookup(s).IsKeyword() || !unicode.IsLetter(r) {
+		return "_" + s
+	}
+	return s
+}
+
 // JSONCamelCase converts a snake_case identifier to a camelCase identifier,
 // according to the protobuf JSON specification.
 func JSONCamelCase(s string) string {
@@ -31,8 +95,7 @@
 	for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
 		c := s[i]
 		if c != '_' {
-			isLower := 'a' <= c && c <= 'z'
-			if wasUnderscore && isLower {
+			if wasUnderscore && isASCIILower(c) {
 				c -= 'a' - 'A' // convert to uppercase
 			}
 			b = append(b, c)
@@ -48,8 +111,7 @@
 	var b []byte
 	for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
 		c := s[i]
-		isUpper := 'A' <= c && c <= 'Z'
-		if isUpper {
+		if isASCIIUpper(c) {
 			b = append(b, '_')
 			c += 'a' - 'A' // convert to lowercase
 		}
@@ -122,3 +184,13 @@
 	}
 	return s
 }
+
+func isASCIILower(c byte) bool {
+	return 'a' <= c && c <= 'z'
+}
+func isASCIIUpper(c byte) bool {
+	return 'A' <= c && c <= 'Z'
+}
+func isASCIIDigit(c byte) bool {
+	return '0' <= c && c <= '9'
+}
diff --git a/internal/strs/strings_test.go b/internal/strs/strings_test.go
index 2c4c2ad..0bb894a 100644
--- a/internal/strs/strings_test.go
+++ b/internal/strs/strings_test.go
@@ -9,6 +9,61 @@
 	"testing"
 )
 
+func TestGoCamelCase(t *testing.T) {
+	tests := []struct {
+		in, want string
+	}{
+		{"", ""},
+		{"one", "One"},
+		{"one_two", "OneTwo"},
+		{"_my_field_name_2", "XMyFieldName_2"},
+		{"Something_Capped", "Something_Capped"},
+		{"my_Name", "My_Name"},
+		{"OneTwo", "OneTwo"},
+		{"_", "X"},
+		{"_a_", "XA_"},
+		{"one.two", "OneTwo"},
+		{"one.Two", "One_Two"},
+		{"one_two.three_four", "OneTwoThreeFour"},
+		{"one_two.Three_four", "OneTwo_ThreeFour"},
+		{"_one._two", "XOne_XTwo"},
+		{"SCREAMING_SNAKE_CASE", "SCREAMING_SNAKE_CASE"},
+		{"double__underscore", "Double_Underscore"},
+		{"camelCase", "CamelCase"},
+		{"go2proto", "Go2Proto"},
+		{"世界", "世界"},
+		{"x世界", "X世界"},
+		{"foo_bar世界", "FooBar世界"},
+	}
+	for _, tc := range tests {
+		if got := GoCamelCase(tc.in); got != tc.want {
+			t.Errorf("GoCamelCase(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
+
+func TestGoSanitized(t *testing.T) {
+	tests := []struct {
+		in, want string
+	}{
+		{"", "_"},
+		{"boo", "boo"},
+		{"Boo", "Boo"},
+		{"ßoo", "ßoo"},
+		{"default", "_default"},
+		{"hello", "hello"},
+		{"hello-world!!", "hello_world__"},
+		{"hello-\xde\xad\xbe\xef\x00", "hello_____"},
+		{"hello 世界", "hello_世界"},
+		{"世界", "世界"},
+	}
+	for _, tc := range tests {
+		if got := GoSanitized(tc.in); got != tc.want {
+			t.Errorf("GoSanitized(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
+
 func TestName(t *testing.T) {
 	tests := []struct {
 		in                string