blob: b49e006fe9402310efa5db8e0ce33b04f8f57df9 [file] [log] [blame]
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// X86spec reads the ``IntelĀ® 64 and IA-32 Architectures Software Developer's Manual''
// to collect instruction encoding details and writes those details to standard output
// in CSV format.
//
// Usage:
//
// x86spec [-f file] [-u url] >x86.csv
//
// The -f flag specifies the input file (default x86manual.pdf), the Intel instruction
// set reference manual in PDF form.
// If the input file does not exist, it will be created by downloading the manual.
//
// The -u flag specifies the URL from which to download the manual
// (default https://golang.org/s/x86manual, which redirects to Intel's site).
// The URL is downloaded only when the file named by the -f flag is missing.
//
// There are additional debugging flags, not shown. Run x86spec -help for the list.
//
// File Format
//
// TODO: Mention comments at top of file.
// TODO: Mention that this is version 0.2 of the file.
// TODO: Mention that file format will change incompatibly until version 1.0.
//
// Each CSV line contains these fields:
//
// 1. The Intel manual instruction mnemonic. For example, "SHR r/m32, imm8".
//
// 2. The Go assembler instruction mnemonic. For example, "SHRL imm8, r/m32".
//
// 3. The GNU binutils instruction mnemonic. For example, "shrl imm8, r/m32".
//
// 4. The instruction encoding. For example, "C1 /4 ib".
//
// 5. The validity of the instruction in 32-bit (aka compatiblity, legacy) mode.
//
// 6. The validity of the instruction in 64-bit mode.
//
// 7. The CPUID feature flags that signal support for the instruction.
//
// 8. Additional comma-separated tags containing hints about the instruction.
//
// 9. The read/write actions of the instruction on the arguments used in
// the Intel mnemonic. For example, "rw,r" to denote that "SHR r/m32, imm8"
// reads and writes its first argument but only reads its second argument.
//
// 10. Whether the opcode used in the Intel mnemonic has encoding forms
// distinguished only by operand size, like most arithmetic instructions.
// The string "Y" indicates yes, the string "" indicates no.
//
// 11. The data size of the operation in bits. In general this is the size corresponding
// to the Go and GNU assembler opcode suffix.
//
// The complete line used for the above examples is:
//
// "SHR r/m32, imm8","SHRL imm8, r/m32","shrl imm8, r/m32","C1 /5 ib","V","V","","operand32","rw,r","Y","32"
//
// Mnemonics
//
// The instruction mnemonics are as used in the Intel manual, with a few exceptions.
//
// Mnemonics claiming general memory forms but that really require fixed addressing modes
// are omitted in favor of their equivalents with implicit arguments..
// For example, "CMPS m16, m16" (really CMPS [SI], [DI]) is omitted in favor of "CMPSW".
//
// Instruction forms with an explicit REP, REPE, or REPNE prefix are also omitted.
// Encoders and decoders are expected to handle those prefixes separately.
//
// Perhaps most significantly, the argument syntaxes used in the mnemonic indicate
// exactly how to derive the argument from the instruction encoding, or vice versa.
//
// Immediate values: imm8, imm8u, imm16, imm16u, imm32, imm64.
// Immediates are signed by default; the u suffixes indicates an unsigned value.
//
// Memory operands. The forms m, m128, m14/28byte, m16, m16&16, m16&32, m16&64, m16:16, m16:32,
// m16:64, m16int, m256, m2byte, m32, m32&32, m32fp, m32int, m512byte, m64, m64fp, m64int,
// m8, m80bcd, m80dec, m80fp, m94/108byte. These operands always correspond to the
// memory address specified by the r/m half of the modrm encoding.
//
// Integer registers.
// The forms r8, r16, r32, r64 indicate a register selected by the modrm reg encoding.
// The forms rmr16, rmr32, rmr64 indicate a register (never memory) selected by the modrm r/m encoding.
// The forms r/m8, r/m16, r/m32, and r/m64 indicate a register or memory selected by the modrm r/m encoding.
// Forms with two sizes, like r32/m16 also indicate a register or memory selected by the modrm r/m encodng,
// but the size for a register argument differs from the size of a memory argument.
// The forms r8V, r16V, r32V, r64V indicate a register selected by the VEX.vvvv bits.
//
// Multimedia registers.
// The forms mm1, xmm1, and ymm1 indicate a multimedia register selected by the
// modrm reg encoding.
// The forms mm2, xmm2, and ymm2 indicate a register (never memory) selected by
// the modrm r/m encoding.
// The forms mm2/m64, xmm2/m128, and so on indicate a register or memory
// selected by the modrm r/m encoding.
// The forms xmmV and ymmV indicate a register selected by the VEX.vvvv bits.
// The forms xmmI and ymmI indicate a register selected by the top four bits of an /is4 immediate byte.
//
// Bound registers.
// The form bnd1 indicate a bound register selected by the modrm reg encoding.
// The form bnd2 indicates a bound register (never memory) selected by the modrm r/m encoding.
// The forms bnd2/m64 and bnd2/m128 indicate a register or memorys selected by the modrm r/m encoding.
// TODO: Describe mib.
//
// One-of-a-kind operands: rel8, rel16, rel32, ptr16:16, ptr16:32,
// moffs8, moffs16, moffs32, moffs64, vm32x, vm32y, vm64x, and vm64y
// are all as in the Intel manual.
//
// Encodings
//
// The encodings are also as used in the Intel manual, with automated corrections.
// For example, the Intel manual sometimes omits the modrm /r indicator or other trailing bytes,
// and it also contains typographical errors.
// These problems are corrected so that the CSV data may be used to generate
// tools for processing x86 machine code.
// See https://golang.org/x/arch/x86/x86map for one such generator.
//
// Valid32 and Valid64
//
// These columns hold validity abbreviations as defined in the Intel manual:
// V, I, N.E., N.P., N.S., or N.I.
// Tools processing the data are typically only concerned with whether the
// column is "V" (valid) or not.
// This data is also corrected compared to the manual.
// For example, the manual lists many instruction forms using REX bytes
// with an incorrect "V" in the Valid32 column.
//
// CPUID Feature Flags
//
// This column specifies CPUID feature flags that must be present in order
// to use the instruction. If multiple flags are required,
// they are listed separated by plus signs, as in PCLMULQDQ+AVX.
// The column can also list one of the values 486, Pentium, PentiumII, and P6,
// indicating that the instruction was introduced on that architecture version.
//
// Tags
//
// The tag column does not correspond to a traditional column in the Intel manual tables.
// Instead, it is itself a comma-separated list of tags or hints derived by analysis
// of the instruction set or the instruction encodings.
//
// The tags address16, address32, and address64 indicate that the instruction form
// applies when using the specified addressing size. It may therefore be necessary to use an
// address size prefix byte to access the instruction.
// If two address tags are listed, the instruction can be used with either of those
// address sizes. An instruction will never list all three address sizes.
// (In fact, today, no instruction lists two address sizes, but that may change.)
//
// The tags operand16, operand32, and operand64 indicate that the instruction form
// applies when using the specified operand size. It may therefore be necessary to use an
// operand size prefix byte to access the instruction.
// If two operand tags are listed, the instruction can be used with either of those
// operand sizes. An instruction will never list all three operand sizes.
//
// The tags modrm_regonly or modrm_memonly indicate that the modrm byte's
// r/m encoding must specify a register or memory, respectively.
// Especially in newer instructions, the modrm constraint may be the only way
// to distinguish two instruction forms. For example the MOVHLPS and MOVLPS
// instructions share the same encoding, except that the former requires the
// modrm byte's r/m to indicate a register, while the latter requires it to indicate memory.
//
// The tags pseudo and pseudo64 indicate that this instruction form is redundant
// with others listed in the table and should be ignored when generating disassembly
// or instruction scanning programs. The pseudo64 tag is reserved for the case where
// the manual lists an instruction twice, once with the optional 64-bit mode REX byte.
// Since most decoders will handle the REX byte separately, the form with the
// unnecessary REX is tagged pseudo64.
//
// Corrections and Additions
//
// The x86spec program makes various corrections to the Intel manual data
// as part of extracting the information. Those corrections are described above.
//
// The x86spec program also adds a few well-known undocumented instructions,
// such as UD1 and FFREEP.
//
// Examples
//
// The latest version of the CSV file is available in this Git repository and also
// online at https://golang.org/s/x86.csv. It is meant to be human-readable for
// quick reference and also to be input for generating tools that operate on
// x86 machine code.
//
// To print instruction syntaxes introduced by the Pentium II and P6,
// using https://rsc.io/csv2tsv to prepare the table for processing by awk:
//
// csv2tsv x86.csv | awk -F'\t' '$5 == "PentiumII" || $5 == "P6" { print $1 }'
//
// The x86map program (https://golang.org/x/arch/x86/x86map)
// reads the CSV file and generates an x86 instruction decoder in the form
// of a simple byte-code program. This decoder is the core of the disassembler
// in the x86asm package (https://golang.org/x/arch/x86/x86asm).
//
package main
import (
"bufio"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"sort"
"strings"
)
const (
specFormatVersion = "0.2"
)
var (
flagDebugPage = flag.String("debugpage", "", "debug page `n` of the manual (can be comma-separated list)")
flagURL = flag.String("u", "https://golang.org/s/x86manual", "use `url` for download if needed")
flagFile = flag.String("f", "x86manual.pdf", "read manual from `file`, downloading if necessary")
flagCompat = flag.Bool("compat", false, "print compatibility statements")
debugging bool
onlySomePages bool
)
type instruction struct {
page int
opcode string
syntax string
valid64 string
valid32 string
cpuid string
desc string
tags []string
args []string
seq int // for use by cleanup
compat string
action string
multisize string
datasize int
gnuSyntax string
goSyntax string
}
func main() {
log.SetFlags(0)
log.SetPrefix("x86spec: ")
flags()
download()
insts := parse()
insts = cleanup(insts)
format(insts)
sort.Sort(bySyntax(insts))
write(os.Stdout, insts)
}
func flags() {
flag.Usage = func() {
fmt.Fprintf(os.Stderr, "usage: x86spec [options]\n")
flag.PrintDefaults()
os.Exit(2)
}
flag.Parse()
if flag.NArg() != 0 {
flag.Usage()
}
debugging = *flagDebugPage != ""
onlySomePages = *flagDebugPage != ""
}
func download() {
_, err := os.Stat(*flagFile)
if !os.IsNotExist(err) {
return
}
// Try downloading.
log.Printf("downloading manual to %s", *flagFile)
resp, err := http.Get(*flagURL)
if err != nil {
log.Fatal(err)
}
if resp.StatusCode != 200 {
log.Fatal(resp.Status)
}
f, err := os.Create(*flagFile)
if err != nil {
log.Fatal(err)
}
_, err = io.Copy(f, resp.Body)
if err != nil {
log.Fatal(err)
}
if err := f.Close(); err != nil {
log.Fatal(err)
}
}
func write(w io.Writer, insts []*instruction) {
bw := bufio.NewWriter(w)
defer bw.Flush()
for _, inst := range insts {
datasize := ""
if inst.datasize != 0 {
datasize = fmt.Sprint(inst.datasize)
}
writeCSV(bw, inst.syntax, inst.goSyntax, inst.gnuSyntax, inst.opcode, inst.valid32, inst.valid64, inst.cpuid, strings.Join(inst.tags, ","), inst.action, inst.multisize, datasize)
}
}
// Note: not using encoding/csv because we want the CSV to use quotes always,
// so that it is a little easier to process with non-CSV tools like grep,
// but the encoding/csv package does not have an "always quote" writing mode.
func writeCSV(w io.Writer, args ...string) {
for i, arg := range args {
if i > 0 {
fmt.Fprintf(w, ",")
}
fmt.Fprintf(w, `"%s"`, strings.Replace(arg, `"`, `""`, -1))
}
fmt.Fprintf(w, "\n")
}