go/go-encode-id.cc - gofrontend - Git at Google

 // go-encode-id.cc -- Go identifier encoding hooks

 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 #include "go-system.h"

 #include "gogo.h"
 #include "go-location.h"
 #include "go-linemap.h"
 #include "go-encode-id.h"
 #include "lex.h"

 // Return whether the character c is OK to use in the assembler.  We
 // only permit ASCII alphanumeric characters, underscore, and dot.

 static bool
 char_needs_encoding(char c)
 {
   switch (c)
     {
     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
     case 'Y': case 'Z':
     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
     case 'y': case 'z':
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
     case '_': case '.':
       return false;
     default:
       return true;
     }
 }

 // Return whether the identifier needs to be translated because it
 // contains non-ASCII characters.

 bool
 go_id_needs_encoding(const std::string& str)
 {
   for (std::string::const_iterator p = str.begin();
        p != str.end();
        ++p)
     if (char_needs_encoding(*p))
       return true;
   return false;
 }

 // Pull the next UTF-8 character out of P and store it in *PC.  Return
 // the number of bytes read.

 static size_t
 fetch_utf8_char(const char* p, unsigned int* pc)
 {
   unsigned char c = *p;
   if ((c & 0x80) == 0)
     {
       *pc = c;
       return 1;
     }
   size_t len = 0;
   while ((c & 0x80) != 0)
     {
       ++len;
       c <<= 1;
     }
   unsigned int rc = *p & ((1 << (7 - len)) - 1);
   for (size_t i = 1; i < len; i++)
     {
       unsigned int u = p[i];
       rc <<= 6;
       rc |= u & 0x3f;
     }
   *pc = rc;
   return len;
 }

 // Encode an identifier using ASCII characters.  The encoding is
 // described in detail near the end of the long comment at the start
 // of names.cc.  Short version: translate all non-ASCII-alphanumeric
 // characters into ..uXXXX or ..UXXXXXXXX.

 std::string
 go_encode_id(const std::string &id)
 {
   if (Lex::is_invalid_identifier(id))
     {
       go_assert(saw_errors());
       return id;
     }

   // The encoding is only unambiguous if the input string does not
   // contain ..u or ..U.
   go_assert(id.find("..u") == std::string::npos);
   go_assert(id.find("..U") == std::string::npos);

   std::string ret;
   const char* p = id.c_str();
   const char* pend = p + id.length();

   // A leading ".0" is a space introduced before a mangled type name
   // that starts with a 'u' or 'U', to avoid confusion with the
   // mangling used here.  We don't need a leading ".0", and we don't
   // want symbols that start with '.', so remove it.
   if (p[0] == '.' && p[1] == '0')
     p += 2;

   while (p < pend)
     {
       unsigned int c;
       size_t len = fetch_utf8_char(p, &c);
       if (len == 1)
 	{
 	  // At this point we should only be seeing alphanumerics or
 	  // underscore or dot.
 	  go_assert(!char_needs_encoding(c));
 	  ret += c;
 	}
       else
 	{
 	  char buf[16];
 	  if (c < 0x10000)
 	    snprintf(buf, sizeof buf, "..u%04x", c);
 	  else
 	    snprintf(buf, sizeof buf, "..U%08x", c);

 	  // We don't want a symbol to start with '.', so add a prefix
 	  // if needed.
 	  if (ret.empty())
 	    ret += '_';

 	  ret += buf;
 	}
       p += len;
     }
   return ret;
 }

 std::string
 go_selectively_encode_id(const std::string &id)
 {
   if (go_id_needs_encoding(id))
     return go_encode_id(id);
   return std::string();
 }

 // Encode a struct field tag.  This is only used when we need to
 // create a type descriptor for an anonymous struct type with field
 // tags.  This mangling is applied before go_encode_id.  We skip
 // alphanumerics and underscore, replace every other single byte
 // character with .xNN, and leave larger UTF-8 characters for
 // go_encode_id.

 std::string
 go_mangle_struct_tag(const std::string& tag)
 {
   std::string ret;
   const char* p = tag.c_str();
   const char* pend = p + tag.length();
   while (p < pend)
     {
       unsigned int c;
       size_t len = fetch_utf8_char(p, &c);
       if (len > 1)
 	ret.append(p, len);
       else if (!char_needs_encoding(c) && c != '.')
 	ret += c;
       else
 	{
 	  char buf[16];
 	  snprintf(buf, sizeof buf, ".x%02x", c);
 	  ret += buf;
 	}
       p += len;
     }
   return ret;
 }
	// go-encode-id.cc -- Go identifier encoding hooks

	// Copyright 2016 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	#include "go-system.h"

	#include "gogo.h"
	#include "go-location.h"
	#include "go-linemap.h"
	#include "go-encode-id.h"
	#include "lex.h"

	// Return whether the character c is OK to use in the assembler. We
	// only permit ASCII alphanumeric characters, underscore, and dot.

	static bool
	char_needs_encoding(char c)
	{
	switch (c)
	{
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
	case 'Y': case 'Z':
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
	case 'y': case 'z':
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	case '_': case '.':
	return false;
	default:
	return true;
	}
	}

	// Return whether the identifier needs to be translated because it
	// contains non-ASCII characters.

	bool
	go_id_needs_encoding(const std::string& str)
	{
	for (std::string::const_iterator p = str.begin();
	p != str.end();
	++p)
	if (char_needs_encoding(*p))
	return true;
	return false;
	}

	// Pull the next UTF-8 character out of P and store it in *PC. Return
	// the number of bytes read.

	static size_t
	fetch_utf8_char(const char* p, unsigned int* pc)
	{
	unsigned char c = *p;
	if ((c & 0x80) == 0)
	{
	*pc = c;
	return 1;
	}
	size_t len = 0;
	while ((c & 0x80) != 0)
	{
	++len;
	c <<= 1;
	}
	unsigned int rc = *p & ((1 << (7 - len)) - 1);
	for (size_t i = 1; i < len; i++)
	{
	unsigned int u = p[i];
	rc <<= 6;
	rc \|= u & 0x3f;
	}
	*pc = rc;
	return len;
	}

	// Encode an identifier using ASCII characters. The encoding is
	// described in detail near the end of the long comment at the start
	// of names.cc. Short version: translate all non-ASCII-alphanumeric
	// characters into ..uXXXX or ..UXXXXXXXX.

	std::string
	go_encode_id(const std::string &id)
	{
	if (Lex::is_invalid_identifier(id))
	{
	go_assert(saw_errors());
	return id;
	}

	// The encoding is only unambiguous if the input string does not
	// contain ..u or ..U.
	go_assert(id.find("..u") == std::string::npos);
	go_assert(id.find("..U") == std::string::npos);

	std::string ret;
	const char* p = id.c_str();
	const char* pend = p + id.length();

	// A leading ".0" is a space introduced before a mangled type name
	// that starts with a 'u' or 'U', to avoid confusion with the
	// mangling used here. We don't need a leading ".0", and we don't
	// want symbols that start with '.', so remove it.
	if (p[0] == '.' && p[1] == '0')
	p += 2;

	while (p < pend)
	{
	unsigned int c;
	size_t len = fetch_utf8_char(p, &c);
	if (len == 1)
	{
	// At this point we should only be seeing alphanumerics or
	// underscore or dot.
	go_assert(!char_needs_encoding(c));
	ret += c;
	}
	else
	{
	char buf[16];
	if (c < 0x10000)
	snprintf(buf, sizeof buf, "..u%04x", c);
	else
	snprintf(buf, sizeof buf, "..U%08x", c);

	// We don't want a symbol to start with '.', so add a prefix
	// if needed.
	if (ret.empty())
	ret += '_';

	ret += buf;
	}
	p += len;
	}
	return ret;
	}

	std::string
	go_selectively_encode_id(const std::string &id)
	{
	if (go_id_needs_encoding(id))
	return go_encode_id(id);
	return std::string();
	}

	// Encode a struct field tag. This is only used when we need to
	// create a type descriptor for an anonymous struct type with field
	// tags. This mangling is applied before go_encode_id. We skip
	// alphanumerics and underscore, replace every other single byte
	// character with .xNN, and leave larger UTF-8 characters for
	// go_encode_id.

	std::string
	go_mangle_struct_tag(const std::string& tag)
	{
	std::string ret;
	const char* p = tag.c_str();
	const char* pend = p + tag.length();
	while (p < pend)
	{
	unsigned int c;
	size_t len = fetch_utf8_char(p, &c);
	if (len > 1)
	ret.append(p, len);
	else if (!char_needs_encoding(c) && c != '.')
	ret += c;
	else
	{
	char buf[16];
	snprintf(buf, sizeof buf, ".x%02x", c);
	ret += buf;
	}
	p += len;
	}
	return ret;
	}