gopls/internal/util/asm/parse.go - tools.git - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Package asm provides a simple parser for Go assembly files.
 package asm

 import (
 	"bufio"
 	"bytes"
 	"fmt"
 	"strings"
 	"unicode"
 )

 // Kind describes the nature of an identifier in an assembly file.
 type Kind uint8

 // The possible values of Kind.
 const (
 	Invalid Kind = iota // reserved zero value; not used by Ident
 	Ref                 // arbitrary reference to symbol or control label
 	Text                // definition of TEXT (function) symbol
 	Global              // definition of GLOBL (var) symbol
 	Data                // initialization of GLOBL (var) symbol; effectively a reference
 	Label               // definition of control label
 )

 // String returns the name of k as listed in kindString, or a string
 // of the form "Kind(N)" if k is beyond the end of that table.
 func (k Kind) String() string {
 	idx := int(k)
 	if idx >= len(kindString) {
 		return fmt.Sprintf("Kind(%d)", k)
 	}
 	return kindString[idx]
 }

 // kindString holds the printed name of each Kind, indexed by its value.
 var kindString = [...]string{
 	Invalid: "invalid",
 	Ref:     "ref",
 	Text:    "text",
 	Global:  "global",
 	Data:    "data",
 	Label:   "label",
 }

 // A File represents a parsed file of Go assembly language.
 // It is the result type of Parse.
 type File struct {
 	// Idents lists the identifiers that Parse recognized,
 	// in the order in which it encountered them.
 	Idents []Ident

 	// TODO(adonovan): use token.File? This may be important in a
 	// future in which analyzers can report diagnostics in .s files.
 }

 // Ident represents an identifier in an assembly file:
 // its (cleaned-up) name, where it starts, and what role it plays there.
 type Ident struct {
 	Name   string // symbol name (after correcting [·∕]); Name[0]='.' => current package
 	Offset int    // zero-based byte offset
 	Kind   Kind   // role of the identifier at this position (definition, reference, label, ...)
 }

 // End returns the identifier's end offset, computed as its start
 // offset plus the byte length of Name.
 //
 // NOTE(review): Name is documented as the name "after correcting [·∕]",
 // so when the source spelled the symbol with those multi-byte
 // characters the result may fall short of the end of the source
 // text -- confirm whether callers depend on the exact source span.
 func (id Ident) End() int {
 	nameLen := len(id.Name)
 	return id.Offset + nameLen
 }

 // Parse extracts identifiers from Go assembly files.
 // Since it is a best-effort parser, it never returns an error.
 //
 // Each non-blank line, after removal of any "//" comment, is treated
 // as one of:
 //   - a label definition ("name:"), recorded with Kind Label;
 //   - a TEXT, GLOBL, or DATA directive, whose first operand is
 //     recorded with Kind Text, Global, or Data respectively;
 //   - anything else, whose words are scanned heuristically for
 //     symbol references, recorded with Kind Ref.
 //
 // The Offset of each Ident is a byte offset into content.
 func Parse(content []byte) *File {
 	var idents []Ident
 	offset := 0 // byte offset of start of current line

 	// TODO(adonovan) use a proper tokenizer that respects
 	// comments, string literals, line continuations, etc.
 	scan := bufio.NewScanner(bytes.NewReader(content))

 	// Allow a single line to be as long as the entire input.
 	// With the scanner's default token limit, scanning would stop
 	// at the first over-long line, silently dropping the rest.
 	scan.Buffer(nil, len(content)+1)

 	// Split into lines as usual, but remember how many input bytes
 	// each line actually consumed: ScanLines strips a trailing "\r"
 	// as well as the "\n", so len(line)+1 would undercount on CRLF
 	// input and every subsequent offset would drift.
 	advance := 0 // bytes consumed by the current line, including its terminator
 	scan.Split(func(data []byte, atEOF bool) (int, []byte, error) {
 		n, token, err := bufio.ScanLines(data, atEOF)
 		if token != nil {
 			advance = n
 		}
 		return n, token, err
 	})

 	for ; scan.Scan(); offset += advance {
 		line := scan.Text()

 		// Strip comments.
 		if idx := strings.Index(line, "//"); idx >= 0 {
 			line = line[:idx]
 		}

 		// Skip blank lines.
 		if strings.TrimSpace(line) == "" {
 			continue
 		}

 		// Check for label definitions (ending with colon).
 		if colon := strings.IndexByte(line, ':'); colon > 0 {
 			label := strings.TrimSpace(line[:colon])
 			if isIdent(label) {
 				idents = append(idents, Ident{
 					Name:   label,
 					Offset: offset + strings.Index(line, label),
 					Kind:   Label,
 				})
 				continue
 			}
 		}

 		// Split line into words.
 		words := strings.Fields(line)
 		if len(words) == 0 {
 			continue
 		}

 		// A line of the form
 		//    TEXT ·sym<ABIInternal>(SB),NOSPLIT,$12
 		// declares a text symbol "·sym".
 		if len(words) > 1 {
 			kind := Invalid
 			switch words[0] {
 			case "TEXT":
 				kind = Text
 			case "GLOBL":
 				kind = Global
 			case "DATA":
 				kind = Data
 			}
 			if kind != Invalid {
 				sym := words[1]
 				sym = cutBefore(sym, ",") // strip ",NOSPLIT,$12" etc
 				sym = cutBefore(sym, "(") // "sym(SB)" -> "sym"
 				sym = cutBefore(sym, "+") // "sym+8" -> "sym" (as in DATA sym+8(SB)/4, ...)
 				sym = cutBefore(sym, "<") // "sym<ABIInternal>" -> "sym"
 				sym = strings.TrimSpace(sym)
 				if isIdent(sym) {
 					// Look for sym only after the directive word, so
 					// that a symbol such as "T" is not mistakenly
 					// located inside "TEXT" itself.
 					start := strings.Index(line, words[0]) + len(words[0])
 					idents = append(idents, Ident{
 						Name:   cleanup(sym),
 						Kind:   kind,
 						Offset: offset + start + strings.Index(line[start:], sym),
 					})
 				}
 				continue
 			}
 		}

 		// Find references in the rest of the line.
 		pos := 0
 		for _, word := range words {
 			// Find actual position of word within line.
 			tokenPos := strings.Index(line[pos:], word)
 			if tokenPos < 0 {
 				// Cannot happen: words are the fields of line, in
 				// order. Give up on this line rather than panic.
 				break
 			}
 			tokenPos += pos
 			pos = tokenPos + len(word)

 			// Reject probable instruction mnemonics (e.g. MOV).
 			// (The prefix test must compare strings, not word[0]:
 			// '·' is a multi-byte character, so its first byte
 			// never equals the rune value.)
 			if len(word) >= 2 && !strings.HasPrefix(word, "·") &&
 				!strings.ContainsFunc(word, unicode.IsLower) {
 				continue
 			}

 			if word[0] == '$' {
 				word = word[1:]
 				tokenPos++

 				// Reject probable immediate values (e.g. "$123").
 				if !strings.ContainsFunc(word, isNonDigit) {
 					continue
 				}
 			}

 			// Reject probable registers (e.g. "PC").
 			if len(word) <= 3 && !strings.ContainsFunc(word, unicode.IsLower) {
 				continue
 			}

 			// Probable identifier reference.
 			//
 			// TODO(adonovan): handle FP symbols correctly;
 			// sym+8(FP) is essentially a comment about
 			// stack slot 8, not a reference to a symbol
 			// with a declaration somewhere; so they form
 			// an equivalence class without a canonical
 			// declaration.
 			//
 			// TODO(adonovan): handle pseudoregisters and field
 			// references such as:
 			//    MOVD	$runtime·g0(SB), g      // pseudoreg
 			//    MOVD	R0, g_stackguard0(g)    // field ref

 			sym := cutBefore(word, "(") // "·sym(SB)" => "·sym"
 			sym = cutBefore(sym, "+")   // "sym+8(FP)" => "sym"
 			sym = cutBefore(sym, "<")   // "sym<ABIInternal>" => "sym"
 			if isIdent(sym) {
 				idents = append(idents, Ident{
 					Name:   cleanup(sym),
 					Kind:   Ref,
 					Offset: offset + tokenPos,
 				})
 			}
 		}
 	}

 	_ = scan.Err() // ignore scan errors

 	return &File{Idents: idents}
 }

 // isIdent reports whether s is a valid Go assembly identifier:
 // a non-empty string in which every rune is accepted by isIdentRune
 // at its position.
 func isIdent(s string) bool {
 	if s == "" {
 		return false
 	}
 	for pos, ch := range s {
 		if isIdentRune(ch, pos) {
 			continue
 		}
 		return false
 	}
 	return true
 }

 // cutBefore returns the portion of s before the first occurrence of sep.
 // If sep does not occur in s, it returns s unchanged.
 func cutBefore(s, sep string) string {
 	// When sep is absent, Cut reports the whole of s as "before",
 	// so no separate not-found case is needed.
 	head, _, _ := strings.Cut(s, sep)
 	return head
 }

 // cleanup converts a symbol name from assembler syntax to linker
 // syntax by applying the substitutions in repl.
 func cleanup(sym string) string {
 	linkName := repl.Replace(sym)
 	return linkName
 }

 // repl maps the stand-in characters used in assembly source
 // to the "." and "/" they represent.
 var repl = strings.NewReplacer(
 	"·", ".", // (U+00B7 MIDDLE DOT)
 	"∕", "/", // (U+2215 DIVISION SLASH)
 )

 // isNonDigit reports whether r is anything other than a Unicode digit.
 func isNonDigit(r rune) bool {
 	if unicode.IsDigit(r) {
 		return false
 	}
 	return true
 }

 // -- plundered from GOROOT/src/cmd/asm/internal/asm/parse.go --

 // isIdentRune reports whether ch may appear in an identifier at
 // offset i. Letters, underscore, center dot (·), and division slash (∕)
 // are accepted at any offset; digits are accepted only when i > 0.
 func isIdentRune(ch rune, i int) bool {
 	switch {
 	case unicode.IsLetter(ch):
 		return true
 	case ch == '_', // Underscore; traditional.
 		ch == '\u00B7', // Represents the period in runtime.exit. U+00B7 '·' middle dot
 		ch == '\u2215': // Represents the slash in runtime/debug.setGCPercent. U+2215 '∕' division slash
 		return true
 	case unicode.IsDigit(ch):
 		// Digits are OK only after the first character.
 		return i > 0
 	}
 	return false
 }
	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Package asm provides a simple parser for Go assembly files.
	package asm

	import (
	"bufio"
	"bytes"
	"fmt"
	"strings"
	"unicode"
	)

	// Kind describes the nature of an identifier in an assembly file.
	type Kind uint8

	// The possible values of Kind.
	const (
		Invalid Kind = iota // reserved zero value; not used by Ident
		Ref                 // arbitrary reference to symbol or control label
		Text                // definition of TEXT (function) symbol
		Global              // definition of GLOBL (var) symbol
		Data                // initialization of GLOBL (var) symbol; effectively a reference
		Label               // definition of control label
	)

	// String returns the name of k as listed in kindString, or a string
	// of the form "Kind(N)" if k is beyond the end of that table.
	func (k Kind) String() string {
		idx := int(k)
		if idx >= len(kindString) {
			return fmt.Sprintf("Kind(%d)", k)
		}
		return kindString[idx]
	}

	// kindString holds the printed name of each Kind, indexed by its value.
	var kindString = [...]string{
		Invalid: "invalid",
		Ref:     "ref",
		Text:    "text",
		Global:  "global",
		Data:    "data",
		Label:   "label",
	}

	// A File represents a parsed file of Go assembly language.
	// It is the result type of Parse.
	type File struct {
	// Idents lists the identifiers that Parse recognized,
	// in the order in which it encountered them.
	Idents []Ident

	// TODO(adonovan): use token.File? This may be important in a
	// future in which analyzers can report diagnostics in .s files.
	}

	// Ident represents an identifier in an assembly file:
	// its (cleaned-up) name, where it starts, and what role it plays there.
	type Ident struct {
	Name string // symbol name (after correcting [·∕]); Name[0]='.' => current package
	Offset int // zero-based byte offset
	Kind Kind // role of the identifier at this position (definition, reference, label, ...)
	}

	// End returns the identifier's end offset, computed as its start
	// offset plus the byte length of Name.
	//
	// NOTE(review): Name is documented as the name "after correcting [·∕]",
	// so when the source spelled the symbol with those multi-byte
	// characters the result may fall short of the end of the source
	// text -- confirm whether callers depend on the exact source span.
	func (id Ident) End() int {
		nameLen := len(id.Name)
		return id.Offset + nameLen
	}

	// Parse extracts identifiers from Go assembly files.
	// Since it is a best-effort parser, it never returns an error.
	//
	// Each non-blank line, after removal of any "//" comment, is treated
	// as one of:
	//   - a label definition ("name:"), recorded with Kind Label;
	//   - a TEXT, GLOBL, or DATA directive, whose first operand is
	//     recorded with Kind Text, Global, or Data respectively;
	//   - anything else, whose words are scanned heuristically for
	//     symbol references, recorded with Kind Ref.
	//
	// The Offset of each Ident is a byte offset into content.
	func Parse(content []byte) *File {
		var idents []Ident
		offset := 0 // byte offset of start of current line

		// TODO(adonovan) use a proper tokenizer that respects
		// comments, string literals, line continuations, etc.
		scan := bufio.NewScanner(bytes.NewReader(content))

		// Allow a single line to be as long as the entire input.
		// With the scanner's default token limit, scanning would stop
		// at the first over-long line, silently dropping the rest.
		scan.Buffer(nil, len(content)+1)

		// Split into lines as usual, but remember how many input bytes
		// each line actually consumed: ScanLines strips a trailing "\r"
		// as well as the "\n", so len(line)+1 would undercount on CRLF
		// input and every subsequent offset would drift.
		advance := 0 // bytes consumed by the current line, including its terminator
		scan.Split(func(data []byte, atEOF bool) (int, []byte, error) {
			n, token, err := bufio.ScanLines(data, atEOF)
			if token != nil {
				advance = n
			}
			return n, token, err
		})

		for ; scan.Scan(); offset += advance {
			line := scan.Text()

			// Strip comments.
			if idx := strings.Index(line, "//"); idx >= 0 {
				line = line[:idx]
			}

			// Skip blank lines.
			if strings.TrimSpace(line) == "" {
				continue
			}

			// Check for label definitions (ending with colon).
			if colon := strings.IndexByte(line, ':'); colon > 0 {
				label := strings.TrimSpace(line[:colon])
				if isIdent(label) {
					idents = append(idents, Ident{
						Name:   label,
						Offset: offset + strings.Index(line, label),
						Kind:   Label,
					})
					continue
				}
			}

			// Split line into words.
			words := strings.Fields(line)
			if len(words) == 0 {
				continue
			}

			// A line of the form
			//    TEXT ·sym<ABIInternal>(SB),NOSPLIT,$12
			// declares a text symbol "·sym".
			if len(words) > 1 {
				kind := Invalid
				switch words[0] {
				case "TEXT":
					kind = Text
				case "GLOBL":
					kind = Global
				case "DATA":
					kind = Data
				}
				if kind != Invalid {
					sym := words[1]
					sym = cutBefore(sym, ",") // strip ",NOSPLIT,$12" etc
					sym = cutBefore(sym, "(") // "sym(SB)" -> "sym"
					sym = cutBefore(sym, "+") // "sym+8" -> "sym" (as in DATA sym+8(SB)/4, ...)
					sym = cutBefore(sym, "<") // "sym<ABIInternal>" -> "sym"
					sym = strings.TrimSpace(sym)
					if isIdent(sym) {
						// Look for sym only after the directive word, so
						// that a symbol such as "T" is not mistakenly
						// located inside "TEXT" itself.
						start := strings.Index(line, words[0]) + len(words[0])
						idents = append(idents, Ident{
							Name:   cleanup(sym),
							Kind:   kind,
							Offset: offset + start + strings.Index(line[start:], sym),
						})
					}
					continue
				}
			}

			// Find references in the rest of the line.
			pos := 0
			for _, word := range words {
				// Find actual position of word within line.
				tokenPos := strings.Index(line[pos:], word)
				if tokenPos < 0 {
					// Cannot happen: words are the fields of line, in
					// order. Give up on this line rather than panic.
					break
				}
				tokenPos += pos
				pos = tokenPos + len(word)

				// Reject probable instruction mnemonics (e.g. MOV).
				// (The prefix test must compare strings, not word[0]:
				// '·' is a multi-byte character, so its first byte
				// never equals the rune value.)
				if len(word) >= 2 && !strings.HasPrefix(word, "·") &&
					!strings.ContainsFunc(word, unicode.IsLower) {
					continue
				}

				if word[0] == '$' {
					word = word[1:]
					tokenPos++

					// Reject probable immediate values (e.g. "$123").
					if !strings.ContainsFunc(word, isNonDigit) {
						continue
					}
				}

				// Reject probable registers (e.g. "PC").
				if len(word) <= 3 && !strings.ContainsFunc(word, unicode.IsLower) {
					continue
				}

				// Probable identifier reference.
				//
				// TODO(adonovan): handle FP symbols correctly;
				// sym+8(FP) is essentially a comment about
				// stack slot 8, not a reference to a symbol
				// with a declaration somewhere; so they form
				// an equivalence class without a canonical
				// declaration.
				//
				// TODO(adonovan): handle pseudoregisters and field
				// references such as:
				//    MOVD	$runtime·g0(SB), g      // pseudoreg
				//    MOVD	R0, g_stackguard0(g)    // field ref

				sym := cutBefore(word, "(") // "·sym(SB)" => "·sym"
				sym = cutBefore(sym, "+")   // "sym+8(FP)" => "sym"
				sym = cutBefore(sym, "<")   // "sym<ABIInternal>" => "sym"
				if isIdent(sym) {
					idents = append(idents, Ident{
						Name:   cleanup(sym),
						Kind:   Ref,
						Offset: offset + tokenPos,
					})
				}
			}
		}

		_ = scan.Err() // ignore scan errors

		return &File{Idents: idents}
	}

	// isIdent reports whether s is a valid Go assembly identifier:
	// a non-empty string in which every rune is accepted by isIdentRune
	// at its position.
	func isIdent(s string) bool {
		if s == "" {
			return false
		}
		for pos, ch := range s {
			if isIdentRune(ch, pos) {
				continue
			}
			return false
		}
		return true
	}

	// cutBefore returns the portion of s before the first occurrence of sep.
	// If sep does not occur in s, it returns s unchanged.
	func cutBefore(s, sep string) string {
		// When sep is absent, Cut reports the whole of s as "before",
		// so no separate not-found case is needed.
		head, _, _ := strings.Cut(s, sep)
		return head
	}

	// cleanup converts a symbol name from assembler syntax to linker
	// syntax by applying the substitutions in repl.
	func cleanup(sym string) string {
		linkName := repl.Replace(sym)
		return linkName
	}

	// repl maps the stand-in characters used in assembly source
	// to the "." and "/" they represent.
	var repl = strings.NewReplacer(
		"·", ".", // (U+00B7 MIDDLE DOT)
		"∕", "/", // (U+2215 DIVISION SLASH)
	)

	// isNonDigit reports whether r is anything other than a Unicode digit.
	func isNonDigit(r rune) bool {
		if unicode.IsDigit(r) {
			return false
		}
		return true
	}

	// -- plundered from GOROOT/src/cmd/asm/internal/asm/parse.go --

	// isIdentRune reports whether ch may appear in an identifier at
	// offset i. Letters, underscore, center dot (·), and division slash (∕)
	// are accepted at any offset; digits are accepted only when i > 0.
	func isIdentRune(ch rune, i int) bool {
		switch {
		case unicode.IsLetter(ch):
			return true
		case ch == '_', // Underscore; traditional.
			ch == '\u00B7', // Represents the period in runtime.exit. U+00B7 '·' middle dot
			ch == '\u2215': // Represents the slash in runtime/debug.setGCPercent. U+2215 '∕' division slash
			return true
		case unicode.IsDigit(ch):
			// Digits are OK only after the first character.
			return i > 0
		}
		return false
	}