go/go-encode-id.cc - gofrontend - Git at Google

 // go-encode-id.cc -- Go identifier and packagepath encoding/decoding hooks

 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 #include "go-system.h"

 #include "gogo.h"
 #include "go-location.h"
 #include "go-linemap.h"
 #include "go-encode-id.h"
 #include "lex.h"

 // Return true if C has to be escaped when it appears in a name that
 // is being encoded.  Only the ASCII letters and digits pass through
 // unescaped; every other value, including NUL and bytes with the
 // high bit set, needs encoding.  The range tests rely on the letters
 // being contiguous in the execution character set, as they are in
 // ASCII.

 static bool
 char_needs_encoding(char c)
 {
   const bool is_upper = c >= 'A' && c <= 'Z';
   const bool is_lower = c >= 'a' && c <= 'z';
   const bool is_digit = c >= '0' && c <= '9';
   return !(is_upper || is_lower || is_digit);
 }

 // Return true if STR contains at least one character for which
 // char_needs_encoding is true, i.e. the identifier cannot be used
 // as is and has to be translated.  An empty string needs no encoding.

 bool
 go_id_needs_encoding(const std::string& str)
 {
   std::string::const_iterator it = str.begin();
   const std::string::const_iterator last = str.end();
   // Skip over the prefix that is fine as it stands.
   while (it != last && !char_needs_encoding(*it))
     ++it;
   return it != last;
 }

 // Table mapping an ASCII character to the single character that
 // follows the underscore in its short encoding.

 class Special_char_code
 {
  public:
   Special_char_code();

   // Look up the short encoding for C.  The result is 0 when C has no
   // short encoding, which is the case for every value above 127.
   char
   code_for(unsigned int c) const
   {
     if (c > 127)
       return 0;
     return this->codes_[c];
   }

  private:
   // Indexed by character value; 0 marks "no short encoding".
   char codes_[128];
 };

 // Fill in the table.  Every entry starts out as 0 and only the
 // characters listed below receive a code.

 Special_char_code::Special_char_code()
 {
   // Each pair is the character followed by its code.
   static const char pairs[] =
     "__" ".0" "/1" "*2" ",3" "{4" "}5" "[6" "]7" "(8" ")9"
     "\"a" " b" ";c";

   memset(this->codes_, 0, sizeof this->codes_);
   for (const char* entry = pairs; entry[0] != '\0'; entry += 2)
     this->codes_[static_cast<unsigned char>(entry[0])] = entry[1];
 }

 // The one shared instance of the table.

 static const Special_char_code special_char_code;

 // Decode the UTF-8 character that starts at P and store its code
 // point in *PC.  Return the number of bytes consumed, which is always
 // at least 1.
 //
 // P must point into a NUL-terminated buffer.  A byte that does not
 // begin a well-formed sequence -- a stray continuation byte, a lead
 // byte announcing more than four bytes, or a sequence interrupted by
 // a non-continuation byte (the terminating NUL included) -- is
 // consumed on its own with *PC set to the raw byte value.  This keeps
 // the decoder from shifting by an out-of-range amount and from
 // reading beyond the terminator.  Overlong forms and surrogate values
 // are not rejected.

 static size_t
 fetch_utf8_char(const char* p, unsigned int* pc)
 {
   const unsigned char lead = static_cast<unsigned char>(*p);

   // Plain ASCII.
   if ((lead & 0x80) == 0)
     {
       *pc = lead;
       return 1;
     }

   // The number of leading 1 bits is the length of the sequence.
   size_t len = 0;
   for (unsigned char bits = lead; (bits & 0x80) != 0; bits <<= 1)
     ++len;

   // One leading 1 bit is a continuation byte, more than four is not
   // UTF-8 at all.
   if (len < 2 || len > 4)
     {
       *pc = lead;
       return 1;
     }

   // The lead byte contributes its low 7 - LEN bits.
   unsigned int rc = lead & ((1u << (7 - len)) - 1);
   for (size_t i = 1; i < len; ++i)
     {
       const unsigned char next = static_cast<unsigned char>(p[i]);
       // Checking each byte before looking at the following one means
       // we stop at the terminating NUL of a truncated sequence.
       if ((next & 0xc0) != 0x80)
         {
           *pc = lead;
           return 1;
         }
       rc = (rc << 6) | (next & 0x3f);
     }
   *pc = rc;
   return len;
 }

 // Translate ID into a form built only from ASCII letters, digits and
 // underscores, so that it is acceptable to an assembler.  Letters and
 // digits are copied; characters known to special_char_code become an
 // underscore plus their one-character code; other single-byte
 // characters become _xNN; multi-byte characters become _uNNNN or
 // _UNNNNNNNN.  The scheme is described in detail near the end of the
 // long comment at the start of names.cc.

 std::string
 go_encode_id(const std::string &id)
 {
   // An identifier that Lex reports as invalid is handed back
   // unchanged, after asserting that errors have been seen.
   if (Lex::is_invalid_identifier(id))
     {
       go_assert(saw_errors());
       return id;
     }

   const char* cur = id.c_str();
   const char* const limit = cur + id.length();
   std::string encoded;
   char buf[16];

   // A leading digit is always written as a hex escape, so the result
   // never starts with a digit.
   if (cur < limit && cur[0] >= '0' && cur[0] <= '9')
     {
       snprintf(buf, sizeof buf, "_x%02x", cur[0]);
       encoded.append(buf);
       ++cur;
     }

   size_t nbytes;
   for (; cur < limit; cur += nbytes)
     {
       unsigned int rune;
       nbytes = fetch_utf8_char(cur, &rune);

       if (nbytes != 1)
         {
           // Multi-byte character: four hex digits when they are
           // enough, eight otherwise.
           if (rune < 0x10000)
             snprintf(buf, sizeof buf, "_u%04x", rune);
           else
             snprintf(buf, sizeof buf, "_U%08x", rune);
           encoded.append(buf);
           continue;
         }

       if (!char_needs_encoding(rune))
         {
           encoded.push_back(rune);
           continue;
         }

       const char code = special_char_code.code_for(rune);
       if (code != 0)
         {
           encoded.push_back('_');
           encoded.push_back(code);
         }
       else
         {
           snprintf(buf, sizeof buf, "_x%02x", rune);
           encoded.append(buf);
         }
     }

   return encoded;
 }

 // Combine the first NDIG characters of DIGITS, most significant
 // first, into a single value, four bits per character.  Each
 // character is converted with Lex::hex_val; nothing here verifies
 // that the characters really are hex digits, so the caller must.

 static unsigned
 hex_digits_to_unicode_codepoint(const char *digits, unsigned ndig)
 {
   unsigned codepoint = 0;
   const char* const stop = digits + ndig;
   while (digits != stop)
     {
       codepoint = (codepoint << 4) | Lex::hex_val(*digits);
       ++digits;
     }
   return codepoint;
 }

 // Return whether C is an ASCII hexadecimal digit.  Spelled out by
 // hand so that the answer does not depend on the locale.

 static bool
 is_encoded_hex_digit(char c)
 {
   return ((c >= '0' && c <= '9')
           || (c >= 'a' && c <= 'f')
           || (c >= 'A' && c <= 'F'));
 }

 // Decode/demangle a mangled string produced by go_encode_id().
 // Returns the empty string if demangling fails: an underscore
 // followed by an unknown code, or an _x/_u/_U escape that is cut
 // short or contains something other than hex digits.  A lone
 // underscore at the very end of the input is copied through.  At the
 // moment this routine is unused; there is an equivalent routine in
 // the runtime used for demangling symbols appearing in stack traces.

 std::string
 go_decode_id(const std::string &encoded)
 {
   std::string ret;
   const char* p = encoded.c_str();
   const char* pend = p + encoded.length();
   const Location loc = Linemap::predeclared_location();

   while (p < pend)
     {
       // Ordinary characters, and a trailing underscore with nothing
       // after it, are copied unchanged.
       if (*p != '_' || p + 1 == pend)
         {
           ret.push_back(*p);
           p++;
           continue;
         }

       // PLAIN is the decoded character for the two-character
       // escapes; NDIG is the number of hex digits for the others.
       char plain = 0;
       unsigned int ndig = 0;
       switch (p[1])
         {
         case '_': plain = '_'; break;
         case '0': plain = '.'; break;
         case '1': plain = '/'; break;
         case '2': plain = '*'; break;
         case '3': plain = ','; break;
         case '4': plain = '{'; break;
         case '5': plain = '}'; break;
         case '6': plain = '['; break;
         case '7': plain = ']'; break;
         case '8': plain = '('; break;
         case '9': plain = ')'; break;
         case 'a': plain = '"'; break;
         case 'b': plain = ' '; break;
         case 'c': plain = ';'; break;
         case 'x': ndig = 2; break;
         case 'u': ndig = 4; break;
         case 'U': ndig = 8; break;
         default:
           return "";
         }
       p += 2;

       if (ndig == 0)
         {
           ret.push_back(plain);
           continue;
         }

       // Measure against PEND rather than calling strlen on the rest
       // of the input for every escape, and make sure that every
       // digit is valid before converting, since the conversion
       // helper does no checking of its own.
       if (static_cast<size_t>(pend - p) < ndig)
         return "";
       for (unsigned int i = 0; i < ndig; ++i)
         if (!is_encoded_hex_digit(p[i]))
           return "";

       unsigned int rune = hex_digits_to_unicode_codepoint(p, ndig);
       Lex::append_char(rune, true, &ret, loc);
       p += ndig;
     }

   return ret;
 }

 // Encode a struct field tag.  This is only used when we need to
 // create a type descriptor for an anonymous struct type with field
 // tags.  Underscore encoding will be applied to the returned string.
 // The tag will appear between curly braces, so '{' and '}' are
 // preceded by a backslash, and so is the backslash itself.
 // Everything else is copied unchanged.

 std::string
 go_mangle_struct_tag(const std::string& tag)
 {
   std::string mangled;
   const char* const limit = tag.c_str() + tag.length();
   size_t nbytes;
   for (const char* cur = tag.c_str(); cur < limit; cur += nbytes)
     {
       unsigned int rune;
       nbytes = fetch_utf8_char(cur, &rune);

       // A multi-byte character is never one of the characters we
       // escape, so its bytes are copied straight across.
       if (nbytes > 1)
         {
           mangled.append(cur, nbytes);
           continue;
         }

       switch (rune)
         {
         case '{':
         case '}':
         case '\\':
           mangled.push_back('\\');
           break;
         default:
           break;
         }
       mangled.push_back(rune);
     }
   return mangled;
 }
	// go-encode-id.cc -- Go identifier and packagepath encoding/decoding hooks

	// Copyright 2016 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	#include "go-system.h"

	#include "gogo.h"
	#include "go-location.h"
	#include "go-linemap.h"
	#include "go-encode-id.h"
	#include "lex.h"

	// Return true if C has to be escaped when it appears in a name that
	// is being encoded.  Only the ASCII letters and digits pass through
	// unescaped; every other value, including NUL and bytes with the
	// high bit set, needs encoding.  The range tests rely on the letters
	// being contiguous in the execution character set, as they are in
	// ASCII.

	static bool
	char_needs_encoding(char c)
	{
	  const bool is_upper = c >= 'A' && c <= 'Z';
	  const bool is_lower = c >= 'a' && c <= 'z';
	  const bool is_digit = c >= '0' && c <= '9';
	  return !(is_upper || is_lower || is_digit);
	}

	// Return true if STR contains at least one character for which
	// char_needs_encoding is true, i.e. the identifier cannot be used
	// as is and has to be translated.  An empty string needs no encoding.

	bool
	go_id_needs_encoding(const std::string& str)
	{
	  std::string::const_iterator it = str.begin();
	  const std::string::const_iterator last = str.end();
	  // Skip over the prefix that is fine as it stands.
	  while (it != last && !char_needs_encoding(*it))
	    ++it;
	  return it != last;
	}

	// Table mapping an ASCII character to the single character that
	// follows the underscore in its short encoding.

	class Special_char_code
	{
	 public:
	  Special_char_code();

	  // Look up the short encoding for C.  The result is 0 when C has no
	  // short encoding, which is the case for every value above 127.
	  char
	  code_for(unsigned int c) const
	  {
	    if (c > 127)
	      return 0;
	    return this->codes_[c];
	  }

	 private:
	  // Indexed by character value; 0 marks "no short encoding".
	  char codes_[128];
	};

	// Fill in the table.  Every entry starts out as 0 and only the
	// characters listed below receive a code.

	Special_char_code::Special_char_code()
	{
	  // Each pair is the character followed by its code.
	  static const char pairs[] =
	    "__" ".0" "/1" "*2" ",3" "{4" "}5" "[6" "]7" "(8" ")9"
	    "\"a" " b" ";c";

	  memset(this->codes_, 0, sizeof this->codes_);
	  for (const char* entry = pairs; entry[0] != '\0'; entry += 2)
	    this->codes_[static_cast<unsigned char>(entry[0])] = entry[1];
	}

	// The one shared instance of the table.

	static const Special_char_code special_char_code;

	// Decode the UTF-8 character that starts at P and store its code
	// point in *PC.  Return the number of bytes consumed, which is always
	// at least 1.
	//
	// P must point into a NUL-terminated buffer.  A byte that does not
	// begin a well-formed sequence -- a stray continuation byte, a lead
	// byte announcing more than four bytes, or a sequence interrupted by
	// a non-continuation byte (the terminating NUL included) -- is
	// consumed on its own with *PC set to the raw byte value.  This keeps
	// the decoder from shifting by an out-of-range amount and from
	// reading beyond the terminator.  Overlong forms and surrogate values
	// are not rejected.

	static size_t
	fetch_utf8_char(const char* p, unsigned int* pc)
	{
	  const unsigned char lead = static_cast<unsigned char>(*p);

	  // Plain ASCII.
	  if ((lead & 0x80) == 0)
	    {
	      *pc = lead;
	      return 1;
	    }

	  // The number of leading 1 bits is the length of the sequence.
	  size_t len = 0;
	  for (unsigned char bits = lead; (bits & 0x80) != 0; bits <<= 1)
	    ++len;

	  // One leading 1 bit is a continuation byte, more than four is not
	  // UTF-8 at all.
	  if (len < 2 || len > 4)
	    {
	      *pc = lead;
	      return 1;
	    }

	  // The lead byte contributes its low 7 - LEN bits.
	  unsigned int rc = lead & ((1u << (7 - len)) - 1);
	  for (size_t i = 1; i < len; ++i)
	    {
	      const unsigned char next = static_cast<unsigned char>(p[i]);
	      // Checking each byte before looking at the following one means
	      // we stop at the terminating NUL of a truncated sequence.
	      if ((next & 0xc0) != 0x80)
	        {
	          *pc = lead;
	          return 1;
	        }
	      rc = (rc << 6) | (next & 0x3f);
	    }
	  *pc = rc;
	  return len;
	}

	// Translate ID into a form built only from ASCII letters, digits and
	// underscores, so that it is acceptable to an assembler.  Letters and
	// digits are copied; characters known to special_char_code become an
	// underscore plus their one-character code; other single-byte
	// characters become _xNN; multi-byte characters become _uNNNN or
	// _UNNNNNNNN.  The scheme is described in detail near the end of the
	// long comment at the start of names.cc.

	std::string
	go_encode_id(const std::string &id)
	{
	  // An identifier that Lex reports as invalid is handed back
	  // unchanged, after asserting that errors have been seen.
	  if (Lex::is_invalid_identifier(id))
	    {
	      go_assert(saw_errors());
	      return id;
	    }

	  const char* cur = id.c_str();
	  const char* const limit = cur + id.length();
	  std::string encoded;
	  char buf[16];

	  // A leading digit is always written as a hex escape, so the result
	  // never starts with a digit.
	  if (cur < limit && cur[0] >= '0' && cur[0] <= '9')
	    {
	      snprintf(buf, sizeof buf, "_x%02x", cur[0]);
	      encoded.append(buf);
	      ++cur;
	    }

	  size_t nbytes;
	  for (; cur < limit; cur += nbytes)
	    {
	      unsigned int rune;
	      nbytes = fetch_utf8_char(cur, &rune);

	      if (nbytes != 1)
	        {
	          // Multi-byte character: four hex digits when they are
	          // enough, eight otherwise.
	          if (rune < 0x10000)
	            snprintf(buf, sizeof buf, "_u%04x", rune);
	          else
	            snprintf(buf, sizeof buf, "_U%08x", rune);
	          encoded.append(buf);
	          continue;
	        }

	      if (!char_needs_encoding(rune))
	        {
	          encoded.push_back(rune);
	          continue;
	        }

	      const char code = special_char_code.code_for(rune);
	      if (code != 0)
	        {
	          encoded.push_back('_');
	          encoded.push_back(code);
	        }
	      else
	        {
	          snprintf(buf, sizeof buf, "_x%02x", rune);
	          encoded.append(buf);
	        }
	    }

	  return encoded;
	}

	// Combine the first NDIG characters of DIGITS, most significant
	// first, into a single value, four bits per character.  Each
	// character is converted with Lex::hex_val; nothing here verifies
	// that the characters really are hex digits, so the caller must.

	static unsigned
	hex_digits_to_unicode_codepoint(const char *digits, unsigned ndig)
	{
	  unsigned result = 0;
	  for (unsigned i = 0; i < ndig; ++i)
	    {
	      result <<= 4;
	      // Plain "|=": the backslash that used to precede the operator
	      // was not valid C++.
	      result |= Lex::hex_val(digits[i]);
	    }
	  return result;
	}

	// Return whether C is an ASCII hexadecimal digit.  Spelled out by
	// hand so that the answer does not depend on the locale.

	static bool
	is_encoded_hex_digit(char c)
	{
	  return ((c >= '0' && c <= '9')
	          || (c >= 'a' && c <= 'f')
	          || (c >= 'A' && c <= 'F'));
	}

	// Decode/demangle a mangled string produced by go_encode_id().
	// Returns the empty string if demangling fails: an underscore
	// followed by an unknown code, or an _x/_u/_U escape that is cut
	// short or contains something other than hex digits.  A lone
	// underscore at the very end of the input is copied through.  At the
	// moment this routine is unused; there is an equivalent routine in
	// the runtime used for demangling symbols appearing in stack traces.

	std::string
	go_decode_id(const std::string &encoded)
	{
	  std::string ret;
	  const char* p = encoded.c_str();
	  const char* pend = p + encoded.length();
	  const Location loc = Linemap::predeclared_location();

	  while (p < pend)
	    {
	      // Ordinary characters, and a trailing underscore with nothing
	      // after it, are copied unchanged.
	      if (*p != '_' || p + 1 == pend)
	        {
	          ret.push_back(*p);
	          p++;
	          continue;
	        }

	      // PLAIN is the decoded character for the two-character
	      // escapes; NDIG is the number of hex digits for the others.
	      char plain = 0;
	      unsigned int ndig = 0;
	      switch (p[1])
	        {
	        case '_': plain = '_'; break;
	        case '0': plain = '.'; break;
	        case '1': plain = '/'; break;
	        case '2': plain = '*'; break;
	        case '3': plain = ','; break;
	        case '4': plain = '{'; break;
	        case '5': plain = '}'; break;
	        case '6': plain = '['; break;
	        case '7': plain = ']'; break;
	        case '8': plain = '('; break;
	        case '9': plain = ')'; break;
	        case 'a': plain = '"'; break;
	        case 'b': plain = ' '; break;
	        case 'c': plain = ';'; break;
	        case 'x': ndig = 2; break;
	        case 'u': ndig = 4; break;
	        case 'U': ndig = 8; break;
	        default:
	          return "";
	        }
	      p += 2;

	      if (ndig == 0)
	        {
	          ret.push_back(plain);
	          continue;
	        }

	      // Measure against PEND rather than calling strlen on the rest
	      // of the input for every escape, and make sure that every
	      // digit is valid before converting, since the conversion
	      // helper does no checking of its own.
	      if (static_cast<size_t>(pend - p) < ndig)
	        return "";
	      for (unsigned int i = 0; i < ndig; ++i)
	        if (!is_encoded_hex_digit(p[i]))
	          return "";

	      unsigned int rune = hex_digits_to_unicode_codepoint(p, ndig);
	      Lex::append_char(rune, true, &ret, loc);
	      p += ndig;
	    }

	  return ret;
	}

	// Encode a struct field tag.  This is only used when we need to
	// create a type descriptor for an anonymous struct type with field
	// tags.  Underscore encoding will be applied to the returned string.
	// The tag will appear between curly braces, so '{' and '}' are
	// preceded by a backslash, and so is the backslash itself.
	// Everything else is copied unchanged.

	std::string
	go_mangle_struct_tag(const std::string& tag)
	{
	  std::string mangled;
	  const char* const limit = tag.c_str() + tag.length();
	  size_t nbytes;
	  for (const char* cur = tag.c_str(); cur < limit; cur += nbytes)
	    {
	      unsigned int rune;
	      nbytes = fetch_utf8_char(cur, &rune);

	      // A multi-byte character is never one of the characters we
	      // escape, so its bytes are copied straight across.
	      if (nbytes > 1)
	        {
	          mangled.append(cur, nbytes);
	          continue;
	        }

	      switch (rune)
	        {
	        case '{':
	        case '}':
	        case '\\':
	          mangled.push_back('\\');
	          break;
	        default:
	          break;
	        }
	      mangled.push_back(rune);
	    }
	  return mangled;
	}