| // go-encode-id.cc -- Go identifier and packagepath encoding/decoding hooks |
| |
| // Copyright 2016 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| #include "go-system.h" |
| |
| #include "gogo.h" |
| #include "go-location.h" |
| #include "go-linemap.h" |
| #include "go-encode-id.h" |
| #include "lex.h" |
| |
// Return true if the character C has to be escaped when it appears in
// a name that we are encoding, and false if it can be copied to the
// output unchanged.  Only the ASCII letters and digits are copied
// unchanged.

static bool
char_needs_encoding(char c)
{
  // Every character that may be emitted verbatim.
  static const char verbatim[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789";

  // Search with an explicit length so that the NUL that terminates
  // the table is not itself treated as a member.
  return memchr(verbatim, c, sizeof verbatim - 1) == NULL;
}
| |
| // Return whether the identifier needs to be translated because it |
| // contains non-ASCII characters. |
| |
| bool |
| go_id_needs_encoding(const std::string& str) |
| { |
| for (std::string::const_iterator p = str.begin(); |
| p != str.end(); |
| ++p) |
| if (char_needs_encoding(*p)) |
| return true; |
| return false; |
| } |
| |
// A lookup table giving, for each ASCII character that has one, the
// single character that follows the underscore in its short encoding.

class Special_char_code
{
 public:
  Special_char_code();

  // Return the character that follows '_' in the short encoding of C.
  // Return 0 if C has no short encoding; that is the case for every C
  // outside the table, i.e. above 127.
  char
  code_for(unsigned int c) const
  {
    if (c >= sizeof this->codes_)
      return 0;
    return this->codes_[c];
  }

 private:
  // One slot per ASCII character; 0 means "no short encoding".
  char codes_[128];
};

// Fill in the table: every slot starts out as 0 and then the slots of
// the characters that have a short encoding are set.

Special_char_code::Special_char_code()
{
  // PLAIN is the character being encoded, CODE is what follows the
  // underscore in its encoding.
  static const struct
  {
    char plain;
    char code;
  } short_codes[] =
    {
      { '_', '_' },
      { '.', '0' },
      { '/', '1' },
      { '*', '2' },
      { ',', '3' },
      { '{', '4' },
      { '}', '5' },
      { '[', '6' },
      { ']', '7' },
      { '(', '8' },
      { ')', '9' },
      { '"', 'a' },
      { ' ', 'b' },
      { ';', 'c' }
    };

  memset(this->codes_, 0, sizeof this->codes_);
  for (size_t i = 0; i < sizeof short_codes / sizeof short_codes[0]; ++i)
    {
      const unsigned char slot =
        static_cast<unsigned char>(short_codes[i].plain);
      this->codes_[slot] = short_codes[i].code;
    }
}

// The single Special_char_code instance used by this file.

static const Special_char_code special_char_code;
| |
// Decode the UTF-8 character that starts at P and store its code
// point in *PC.  Return the number of bytes consumed, which is always
// at least 1.
//
// P must point into a NUL-terminated buffer.  Malformed input -- a
// stray continuation byte, a lead byte that announces more than four
// bytes, or a sequence cut short by a byte that is not a continuation
// byte (which includes the terminating NUL) -- is consumed one byte at
// a time: the raw byte value is stored in *PC and 1 is returned.  This
// keeps the function from reading past the terminator and from
// shifting by an out-of-range amount.  Overlong forms and surrogate
// code points are not rejected.

static size_t
fetch_utf8_char(const char* p, unsigned int* pc)
{
  const unsigned char lead = static_cast<unsigned char>(*p);
  if ((lead & 0x80) == 0)
    {
      // Plain ASCII.
      *pc = lead;
      return 1;
    }

  // The number of leading one bits in the first byte is the length of
  // the sequence.
  size_t len = 0;
  for (unsigned char bits = lead; (bits & 0x80) != 0; bits <<= 1)
    ++len;

  // A length of 1 is a stray continuation byte and lengths above 4
  // are not UTF-8; both fall through to the single-byte case below.
  if (len >= 2 && len <= 4)
    {
      // The payload of the lead byte is whatever follows the length
      // marker and its terminating zero bit.
      unsigned int rc = lead & ((1u << (7 - len)) - 1);
      size_t i = 1;
      for (; i < len; ++i)
        {
          const unsigned char cont = static_cast<unsigned char>(p[i]);
          // Stop at the first byte that is not 10xxxxxx.  The NUL
          // terminator fails this test, so we never read beyond it.
          if ((cont & 0xc0) != 0x80)
            break;
          rc = (rc << 6) | (cont & 0x3f);
        }
      if (i == len)
        {
          *pc = rc;
          return len;
        }
    }

  // Malformed: hand back the offending byte on its own.
  *pc = lead;
  return 1;
}
| |
| // Encode an identifier using assembler-friendly characters. The |
| // encoding is described in detail near the end of the long comment at |
| // the start of names.cc. |
| |
| std::string |
| go_encode_id(const std::string &id) |
| { |
| if (Lex::is_invalid_identifier(id)) |
| { |
| go_assert(saw_errors()); |
| return id; |
| } |
| |
| std::string ret; |
| const char* p = id.c_str(); |
| const char* pend = p + id.length(); |
| |
| // We encode a leading digit, to ensure that no identifier starts |
| // with a digit. |
| if (pend > p && p[0] >= '0' && p[0] <= '9') |
| { |
| char buf[8]; |
| snprintf(buf, sizeof buf, "_x%02x", p[0]); |
| ret.append(buf); |
| ++p; |
| } |
| |
| while (p < pend) |
| { |
| unsigned int c; |
| size_t len = fetch_utf8_char(p, &c); |
| if (len == 1) |
| { |
| if (!char_needs_encoding(c)) |
| ret.push_back(c); |
| else |
| { |
| char code = special_char_code.code_for(c); |
| if (code != 0) |
| { |
| ret.push_back('_'); |
| ret.push_back(code); |
| } |
| else |
| { |
| char buf[8]; |
| snprintf(buf, sizeof buf, "_x%02x", c); |
| ret.append(buf); |
| } |
| } |
| } |
| else |
| { |
| char buf[16]; |
| if (c < 0x10000) |
| snprintf(buf, sizeof buf, "_u%04x", c); |
| else |
| snprintf(buf, sizeof buf, "_U%08x", c); |
| ret.append(buf); |
| } |
| |
| p += len; |
| } |
| |
| return ret; |
| } |
| |
| // Convert a hex digit string to a unicode codepoint. No checking |
| // to insure that the hex digit is meaningful. |
| |
| static unsigned |
| hex_digits_to_unicode_codepoint(const char *digits, unsigned ndig) |
| { |
| unsigned result = 0; |
| for (unsigned i = 0; i < ndig; ++i) { |
| result <<= 4; |
| result |= Lex::hex_val(digits[i]); |
| } |
| return result; |
| } |
| |
| // Decode/demangle a mangled string produced by go_encode_id(). Returns |
| // empty string if demangling process fails in some way. At the moment |
| // this routine is unused; there is an equivalent routine in the runtime |
| // used for demangling symbols appearing in stack traces. |
| |
| std::string |
| go_decode_id(const std::string &encoded) |
| { |
| std::string ret; |
| const char* p = encoded.c_str(); |
| const char* pend = p + encoded.length(); |
| const Location loc = Linemap::predeclared_location(); |
| |
| while (p < pend) |
| { |
| if (*p != '_' || p + 1 == pend) |
| { |
| ret.push_back(*p); |
| p++; |
| continue; |
| } |
| |
| switch (p[1]) |
| { |
| case '_': |
| ret.push_back('_'); |
| p += 2; |
| break; |
| case '0': |
| ret.push_back('.'); |
| p += 2; |
| break; |
| case '1': |
| ret.push_back('/'); |
| p += 2; |
| break; |
| case '2': |
| ret.push_back('*'); |
| p += 2; |
| break; |
| case '3': |
| ret.push_back(','); |
| p += 2; |
| break; |
| case '4': |
| ret.push_back('{'); |
| p += 2; |
| break; |
| case '5': |
| ret.push_back('}'); |
| p += 2; |
| break; |
| case '6': |
| ret.push_back('['); |
| p += 2; |
| break; |
| case '7': |
| ret.push_back(']'); |
| p += 2; |
| break; |
| case '8': |
| ret.push_back('('); |
| p += 2; |
| break; |
| case '9': |
| ret.push_back(')'); |
| p += 2; |
| break; |
| case 'a': |
| ret.push_back('"'); |
| p += 2; |
| break; |
| case 'b': |
| ret.push_back(' '); |
| p += 2; |
| break; |
| case 'c': |
| ret.push_back(';'); |
| p += 2; |
| break; |
| case 'x': |
| { |
| const char* digits = p + 2; |
| if (strlen(digits) < 2) |
| return ""; |
| unsigned int rune = hex_digits_to_unicode_codepoint(digits, 2); |
| Lex::append_char(rune, true, &ret, loc); |
| p += 4; |
| } |
| break; |
| case 'u': |
| { |
| const char* digits = p + 2; |
| if (strlen(digits) < 4) |
| return ""; |
| unsigned int rune = hex_digits_to_unicode_codepoint(digits, 4); |
| Lex::append_char(rune, true, &ret, loc); |
| p += 6; |
| } |
| break; |
| case 'U': |
| { |
| const char* digits = p + 2; |
| if (strlen(digits) < 8) |
| return ""; |
| unsigned int rune = hex_digits_to_unicode_codepoint(digits, 8); |
| Lex::append_char(rune, true, &ret, loc); |
| p += 10; |
| } |
| break; |
| default: |
| return ""; |
| } |
| } |
| |
| return ret; |
| } |
| |
// Encode a struct field tag.  This is only used when we need to
// create a type descriptor for an anonymous struct type with field
// tags.  Underscore encoding will be applied to the returned string.
// The tag will appear between curly braces, so that is all we have to
// avoid: '{', '}' and the escape character '\' itself are each
// prefixed with a backslash, and every other byte is copied through
// unchanged.
//
// The scan is byte by byte.  The three characters of interest are
// ASCII and every byte of a multi-byte UTF-8 sequence has its high bit
// set, so for well-formed UTF-8 this is the same as scanning character
// by character.  It also means that a tag holding bytes that are not
// valid UTF-8 is passed through intact, without reading beyond the
// end of the string.

std::string
go_mangle_struct_tag(const std::string& tag)
{
  std::string ret;
  ret.reserve(tag.length());
  for (std::string::const_iterator p = tag.begin();
       p != tag.end();
       ++p)
    {
      const char c = *p;
      if (c == '{' || c == '}' || c == '\\')
        ret.push_back('\\');
      ret.push_back(c);
    }
  return ret;
}