src/cmd/internal/obj/x86/asm6.go - go - Git at Google

 // Inferno utils/6l/span.c
 // https://bitbucket.org/inferno-os/inferno-os/src/master/utils/6l/span.c
 //
 //	Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
 //	Portions Copyright © 1995-1997 C H Forsyth (forsyth@terzarima.net)
 //	Portions Copyright © 1997-1999 Vita Nuova Limited
 //	Portions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com)
 //	Portions Copyright © 2004,2006 Bruce Ellis
 //	Portions Copyright © 2005-2007 C H Forsyth (forsyth@terzarima.net)
 //	Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
 //	Portions Copyright © 2009 The Go Authors. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.

 package x86

 import (
 	"cmd/internal/obj"
 	"cmd/internal/objabi"
 	"cmd/internal/sys"
 	"encoding/binary"
 	"fmt"
 	"internal/buildcfg"
 	"log"
 	"strings"
 )

 // plan9privates is a package-level symbol pointer.
 // NOTE(review): it is not referenced in this part of the file; judging by
 // the name it is presumably used for Plan 9 targets — confirm at the use sites.
 var plan9privates *obj.LSym

 // Instruction layout.

 // Loop alignment tuning.
 //
 // A loop entry is the target of a backward jump. The assembler wants each
 // loop entry on a loopAlign-byte boundary and is willing to insert at most
 // maxLoopPad bytes of NOP to get it there.
 //
 // For comparison, gcc's 'generic x86-64' config uses a pad limit of 10 and
 // aligns every jump target, not only backward jump targets.
 //
 // As of 6/1/2012, a pad limit of 10 had a very slight but negative effect
 // here, so alignment is disabled by leaving maxLoopPad at 0. The code is
 // kept for reference and for future experiments.

 // loopAlign is the boundary, in bytes, that loop entries are aligned to.
 const loopAlign = 16

 // maxLoopPad is the largest number of NOP bytes inserted to align one loop
 // entry; 0 disables loop alignment.
 const maxLoopPad = 0

 // Jump target property flags. Every flag occupies its own bit, so several
 // of them can be or'ed together in one value.
 const (
 	// branchBackwards: the target is located behind the jump.
 	// Used to express jumps to loop headers.
 	branchBackwards = 1 << 0
 	// branchShort: the target is close, i.e. the branch
 	// offset is in the -128..127 range.
 	branchShort = 1 << 1
 	// branchLoopHead: the target is a loop entry.
 	// Used to insert padding for misaligned loops.
 	branchLoopHead = 1 << 2
 )

 // opBytes holds the encoding bytes of one optab entry.
 // Each row of the entry's ytab reserves a fixed number of bytes in this
 // array (the row's zoffset), laid out in row order.
 //
 // The size should be the minimal number of bytes that
 // is enough to hold the biggest optab op lines.
 type opBytes [31]uint8

 // Optab is one row of the instruction table optab. It ties an instruction
 // to the operand forms it accepts and to the bytes used to encode each form.
 //
 //   - as is the instruction the row describes; doasm looks rows up by p.As.
 //   - ytab lists the accepted operand forms and is scanned in order.
 //   - prefix is one of the P* constants and controls the prefix bytes emitted.
 //   - op holds the encoding bytes that the ytab rows index into.
 type Optab struct {
 	as     obj.As
 	ytab   []ytab
 	prefix uint8
 	op     opBytes
 }

 // movtab is one row of a table keyed by instruction (as).
 // NOTE(review): the table and the code reading it are outside this section.
 // Judging by the names, ft/f3t/tt are presumably the operand classes of the
 // from, third and to operands, code selects the encoding scheme and op holds
 // the opcode bytes — confirm against the table's users before relying on it.
 type movtab struct {
 	as   obj.As
 	ft   uint8
 	f3t  uint8
 	tt   uint8
 	code uint8
 	op   [4]uint8
 }

 // Operand classes ("Ytypes").
 //
 // oclass computes the most specific class of an operand; ycover records
 // which more general classes that class also satisfies. The ytab tables
 // name, for every accepted instruction form, the class required of each
 // operand. Ymax is the number of classes and sizes ycover.
 //
 // The numeric values come from iota, so the order of this list must not be
 // changed casually.
 const (
 	Yxxx = iota
 	Ynone
 	Yi0 // $0
 	Yi1 // $1
 	Yu2 // $x, x fits in uint2
 	Yi8 // $x, x fits in int8
 	Yu8 // $x, x fits in uint8
 	Yu7 // $x, x in 0..127 (fits in both int8 and uint8)
 	Ys32
 	Yi32
 	Yi64
 	Yiauto
 	Yal
 	Ycl
 	Yax
 	Ycx
 	Yrb
 	Yrl
 	Yrl32 // Yrl on 32-bit system
 	Yrf
 	Yf0
 	Yrx
 	Ymb
 	Yml
 	Ym
 	Ybr
 	Ycs
 	Yss
 	Yds
 	Yes
 	Yfs
 	Ygs
 	Ygdtr
 	Yidtr
 	Yldtr
 	Ymsw
 	Ytask
 	Ycr0
 	Ycr1
 	Ycr2
 	Ycr3
 	Ycr4
 	Ycr5
 	Ycr6
 	Ycr7
 	Ycr8
 	Ydr0
 	Ydr1
 	Ydr2
 	Ydr3
 	Ydr4
 	Ydr5
 	Ydr6
 	Ydr7
 	Ytr0
 	Ytr1
 	Ytr2
 	Ytr3
 	Ytr4
 	Ytr5
 	Ytr6
 	Ytr7
 	Ymr
 	Ymm
 	Yxr0          // X0 only. "<XMM0>" notation in Intel manual.
 	YxrEvexMulti4 // [ X<n> - X<n+3> ]; multisource YxrEvex
 	Yxr           // X0..X15
 	YxrEvex       // X0..X31
 	Yxm
 	YxmEvex       // YxrEvex+Ym
 	Yxvm          // VSIB vector array; vm32x/vm64x
 	YxvmEvex      // Yxvm which permits High-16 X register as index.
 	YyrEvexMulti4 // [ Y<n> - Y<n+3> ]; multisource YyrEvex
 	Yyr           // Y0..Y15
 	YyrEvex       // Y0..Y31
 	Yym
 	YymEvex   // YyrEvex+Ym
 	Yyvm      // VSIB vector array; vm32y/vm64y
 	YyvmEvex  // Yyvm which permits High-16 Y register as index.
 	YzrMulti4 // [ Z<n> - Z<n+3> ]; multisource YzrEvex
 	Yzr       // Z0..Z31
 	Yzm       // Yzr+Ym
 	Yzvm      // VSIB vector array; vm32z/vm64z
 	Yk0       // K0
 	Yknot0    // K1..K7; write mask
 	Yk        // K0..K7; used for KOP
 	Ykm       // Yk+Ym; used for KOP
 	Ytls
 	Ytextsize
 	Yindir
 	Ymax
 )

 // Encoding cases ("Ztypes").
 //
 // Every ytab row names one of these. The switch on yt.zcase in doasm uses
 // it, together with the row's bytes from Optab.op, to decide how the
 // instruction bytes are laid down.
 //
 // The list is split into three runs: the classic cases, the Zvex* cases and
 // the Zevex* cases.
 // NOTE(review): Zevex_first and Zevex_last look like markers bracketing the
 // EVEX cases rather than real encodings — confirm against doasm.
 const (
 	Zxxx = iota
 	Zlit
 	Zlitm_r
 	Zlitr_m
 	Zlit_m_r
 	Z_rp
 	Zbr
 	Zcall
 	Zcallcon
 	Zcallduff
 	Zcallind
 	Zcallindreg
 	Zib_
 	Zib_rp
 	Zibo_m
 	Zibo_m_xm
 	Zil_
 	Zil_rp
 	Ziq_rp
 	Zilo_m
 	Zjmp
 	Zjmpcon
 	Zloop
 	Zo_iw
 	Zm_o
 	Zm_r
 	Z_m_r
 	Zm2_r
 	Zm_r_xm
 	Zm_r_i_xm
 	Zm_r_xm_nr
 	Zr_m_xm_nr
 	Zibm_r // mmx1,mmx2/mem64,imm8
 	Zibr_m
 	Zmb_r
 	Zaut_r
 	Zo_m
 	Zo_m64
 	Zpseudo
 	Zr_m
 	Zr_m_xm
 	Zrp_
 	Z_ib
 	Z_il
 	Zm_ibo
 	Zm_ilo
 	Zib_rr
 	Zil_rr
 	Zbyte

 	Zvex_rm_v_r
 	Zvex_rm_v_ro
 	Zvex_r_v_rm
 	Zvex_i_rm_vo
 	Zvex_v_rm_r
 	Zvex_i_rm_r
 	Zvex_i_r_v
 	Zvex_i_rm_v_r
 	Zvex
 	Zvex_rm_r_vo
 	Zvex_i_r_rm
 	Zvex_hr_rm_v_r

 	Zevex_first
 	Zevex_i_r_k_rm
 	Zevex_i_r_rm
 	Zevex_i_rm_k_r
 	Zevex_i_rm_k_vo
 	Zevex_i_rm_r
 	Zevex_i_rm_v_k_r
 	Zevex_i_rm_v_r
 	Zevex_i_rm_vo
 	Zevex_k_rmo
 	Zevex_r_k_rm
 	Zevex_r_v_k_rm
 	Zevex_r_v_rm
 	Zevex_rm_k_r
 	Zevex_rm_v_k_r
 	Zevex_rm_v_r
 	Zevex_last

 	Zmax
 )

 // Prefix selectors and REX bits.
 //
 // The P* values are stored in Optab.prefix and control which prefix and
 // escape bytes are emitted for an instruction. Some optab entries also place
 // a P* value inside opBytes ahead of an opcode byte (for example
 // opBytes{Pe, 0xc2}). Several values are symbolic tags, as noted on the
 // individual constants.
 //
 // The Rx* values are single-bit flags describing the REX.W/R/X/B fields;
 // RxrEvex is the AVX-512 extension of the R bit.
 const (
 	Px   = 0
 	Px1  = 1    // symbolic; exact value doesn't matter
 	P32  = 0x32 // 32-bit only
 	Pe   = 0x66 // operand escape
 	Pm   = 0x0f // 2byte opcode escape
 	Pq   = 0xff // both escapes: 66 0f
 	Pb   = 0xfe // byte operands
 	Pf2  = 0xf2 // xmm escape 1: f2 0f
 	Pf3  = 0xf3 // xmm escape 2: f3 0f
 	Pef3 = 0xf5 // xmm escape 2 with 16-bit prefix: 66 f3 0f
 	Pq3  = 0x67 // xmm escape 3: 66 48 0f
 	Pq4  = 0x68 // xmm escape 4: 66 0F 38
 	Pq4w = 0x69 // Pq4 with Rex.w 66 0F 38
 	Pq5  = 0x6a // xmm escape 5: F3 0F 38
 	Pq5w = 0x6b // Pq5 with Rex.w F3 0F 38
 	Pfw  = 0xf4 // Pf3 with Rex.w: f3 48 0f
 	Pw   = 0x48 // Rex.w
 	Pw8  = 0x90 // symbolic; exact value doesn't matter
 	Py   = 0x80 // defaults to 64-bit mode
 	Py1  = 0x81 // symbolic; exact value doesn't matter
 	Py3  = 0x83 // symbolic; exact value doesn't matter
 	Pavx = 0x84 // symbolic: exact value doesn't matter

 	RxrEvex = 1 << 4 // AVX512 extension to REX.R/VEX.R
 	Rxw     = 1 << 3 // =1, 64-bit operand size
 	Rxr     = 1 << 2 // extend modrm reg
 	Rxx     = 1 << 1 // extend sib index
 	Rxb     = 1 << 0 // extend modrm r/m, sib base, or opcode reg
 )

 const (
 	// Encoding for VEX prefix in tables.
 	// The P, L, and W fields are chosen to match
 	// their eventual locations in the VEX prefix bytes.

 	// Using spare bit to make leading [E]VEX encoding byte different from
 	// 0x0f even if all other VEX fields are 0.
 	avxEscape = 1 << 6

 	// P field - 2 bits
 	vex66 = 1 << 0
 	vexF3 = 2 << 0
 	vexF2 = 3 << 0
 	// L field - 1 bit (LZ, LIG and 128 all encode as L=0)
 	vexLZ  = 0 << 2
 	vexLIG = 0 << 2
 	vex128 = 0 << 2
 	vex256 = 1 << 2
 	// W field - 1 bit (WIG and W0 both encode as W=0)
 	vexWIG = 0 << 7
 	vexW0  = 0 << 7
 	vexW1  = 1 << 7
 	// M field - 5 bits, but mostly reserved; we can store up to 3
 	vex0F   = 1 << 3
 	vex0F38 = 2 << 3
 	vex0F3A = 3 << 3
 )

 // ycover is a Ymax-by-Ymax table stored flat: a non-zero
 // ycover[from*Ymax+to] means that an operand whose class is from is also
 // accepted where class to is required. It is set up in instinit.
 var ycover [Ymax * Ymax]uint8

 // reg and regrex are per-register lookup tables sized by MAXREG.
 // NOTE(review): they are filled in outside this section; presumably reg
 // holds a register's encoding number and regrex the REX bits it needs —
 // confirm where they are initialized.
 var reg [MAXREG]int

 var regrex [MAXREG + 1]int

 // The y* variables that follow are ytabs: lists of the operand forms an
 // instruction accepts. Each row is {Ztype, zoffset, argList{Ytypes...}}.
 // doasm walks the rows in order looking for one whose operand classes
 // match, advancing through Optab.op by zoffset for every row it passes.

 // ynone is the form for instructions written without operands
 // (e.g. AAA, CLC and CPUID in optab).
 var ynone = []ytab{
 	{Zlit, 1, argList{}},
 }

 // ytext holds two Zpseudo forms that end in a Ytextsize operand, the second
 // with a Yi32 in the middle.
 // NOTE(review): presumably for the TEXT directive; its optab entry is
 // outside this section.
 var ytext = []ytab{
 	{Zpseudo, 0, argList{Ymb, Ytextsize}},
 	{Zpseudo, 1, argList{Ymb, Yi32, Ytextsize}},
 }

 // ynop holds Zpseudo forms taking no operand or one operand.
 // NOTE(review): the Yiauto/Yml/Yrf/Yxr rows appear twice (the final one with
 // zoffset 1); check whether the second set is still reachable or is a
 // leftover from an older row layout.
 var ynop = []ytab{
 	{Zpseudo, 0, argList{}},
 	{Zpseudo, 0, argList{Yiauto}},
 	{Zpseudo, 0, argList{Yml}},
 	{Zpseudo, 0, argList{Yrf}},
 	{Zpseudo, 0, argList{Yxr}},
 	{Zpseudo, 0, argList{Yiauto}},
 	{Zpseudo, 0, argList{Yml}},
 	{Zpseudo, 0, argList{Yrf}},
 	{Zpseudo, 1, argList{Yxr}},
 }

 // yfuncdata is a Zpseudo form taking (Yi32, Ym).
 // NOTE(review): presumably for FUNCDATA — confirm in optab.
 var yfuncdata = []ytab{
 	{Zpseudo, 0, argList{Yi32, Ym}},
 }

 // ypcdata is a Zpseudo form taking (Yi32, Yi32).
 // NOTE(review): presumably for PCDATA — confirm in optab.
 var ypcdata = []ytab{
 	{Zpseudo, 0, argList{Yi32, Yi32}},
 }

 // yxorb: two-operand byte forms (immediate to AL, immediate to r/m,
 // register to r/m, r/m to register). Used by ADCB, ADDB and ANDB in the
 // visible part of optab.
 var yxorb = []ytab{
 	{Zib_, 1, argList{Yi32, Yal}},
 	{Zibo_m, 2, argList{Yi32, Ymb}},
 	{Zr_m, 1, argList{Yrb, Ymb}},
 	{Zm_r, 1, argList{Ymb, Yrb}},
 }

 // yaddl: the wider counterpart of yxorb with an extra leading row for an
 // 8-bit immediate. Used by the L, Q and W variants of ADC, ADD and AND in
 // the visible part of optab; the optab comment walks through this table.
 var yaddl = []ytab{
 	{Zibo_m, 2, argList{Yi8, Yml}},
 	{Zil_, 1, argList{Yi32, Yax}},
 	{Zilo_m, 2, argList{Yi32, Yml}},
 	{Zr_m, 1, argList{Yrl, Yml}},
 	{Zm_r, 1, argList{Yml, Yrl}},
 }

 // yincl: single-operand forms with a register-only row ahead of the
 // general r/m row. Used by INCL and DECL.
 var yincl = []ytab{
 	{Z_rp, 1, argList{Yrl}},
 	{Zo_m, 2, argList{Yml}},
 }

 // yincq: like yincl but without the register-only row.
 // Used by INCQ, INCW, DECQ and DECW.
 var yincq = []ytab{
 	{Zo_m, 2, argList{Yml}},
 }

 // ycmpb: yxorb with the operand order of the immediate rows reversed.
 // Used by CMPB.
 var ycmpb = []ytab{
 	{Z_ib, 1, argList{Yal, Yi32}},
 	{Zm_ibo, 2, argList{Ymb, Yi32}},
 	{Zm_r, 1, argList{Ymb, Yrb}},
 	{Zr_m, 1, argList{Yrb, Ymb}},
 }

 // ycmpl: yaddl with the operand order of the immediate rows reversed.
 // Used by CMPL, CMPQ and CMPW.
 var ycmpl = []ytab{
 	{Zm_ibo, 2, argList{Yml, Yi8}},
 	{Z_il, 1, argList{Yax, Yi32}},
 	{Zm_ilo, 2, argList{Yml, Yi32}},
 	{Zm_r, 1, argList{Yml, Yrl}},
 	{Zr_m, 1, argList{Yrl, Yml}},
 }

 // yshb: forms whose first operand is $1, an unsigned 8-bit immediate or a
 // Ycx register, applied to a byte r/m operand.
 // NOTE(review): presumably the byte shift/rotate instructions — their optab
 // entries are outside this section.
 var yshb = []ytab{
 	{Zo_m, 2, argList{Yi1, Ymb}},
 	{Zibo_m, 2, argList{Yu8, Ymb}},
 	{Zo_m, 2, argList{Ycx, Ymb}},
 }

 // yshl: like yshb for Yml operands, with an additional Ycl row.
 var yshl = []ytab{
 	{Zo_m, 2, argList{Yi1, Yml}},
 	{Zibo_m, 2, argList{Yu8, Yml}},
 	{Zo_m, 2, argList{Ycl, Yml}},
 	{Zo_m, 2, argList{Ycx, Yml}},
 }

 // ytestl: the same rows as yaddl minus the leading 8-bit-immediate row.
 var ytestl = []ytab{
 	{Zil_, 1, argList{Yi32, Yax}},
 	{Zilo_m, 2, argList{Yi32, Yml}},
 	{Zr_m, 1, argList{Yrl, Yml}},
 	{Zm_r, 1, argList{Yml, Yrl}},
 }

 // ymovb: byte move forms (register/memory in both directions, immediate to
 // register, immediate to r/m). Used by MOVB.
 var ymovb = []ytab{
 	{Zr_m, 1, argList{Yrb, Ymb}},
 	{Zm_r, 1, argList{Ymb, Yrb}},
 	{Zib_rp, 1, argList{Yi32, Yrb}},
 	{Zibo_m, 2, argList{Yi32, Ymb}},
 }

 // ybtl: 8-bit immediate or register applied to an r/m operand.
 // Used by the BT, BTC, BTR and BTS families.
 var ybtl = []ytab{
 	{Zibo_m, 2, argList{Yi8, Yml}},
 	{Zr_m, 1, argList{Yrl, Yml}},
 }

 // ymovw: general move forms plus a trailing Yiauto-to-register row.
 // It matches ymovl without the MMX/XMM rows.
 var ymovw = []ytab{
 	{Zr_m, 1, argList{Yrl, Yml}},
 	{Zm_r, 1, argList{Yml, Yrl}},
 	{Zil_rp, 1, argList{Yi32, Yrl}},
 	{Zilo_m, 2, argList{Yi32, Yml}},
 	{Zaut_r, 2, argList{Yiauto, Yrl}},
 }

 // ymovl: forms accepted by MOVL: the general-register moves followed by
 // the MMX and XMM MOVD forms.
 var ymovl = []ytab{
 	{Zr_m, 1, argList{Yrl, Yml}},
 	{Zm_r, 1, argList{Yml, Yrl}},
 	{Zil_rp, 1, argList{Yi32, Yrl}},
 	{Zilo_m, 2, argList{Yi32, Yml}},
 	{Zm_r_xm, 1, argList{Yml, Ymr}}, // MMX MOVD
 	{Zr_m_xm, 1, argList{Ymr, Yml}}, // MMX MOVD
 	{Zm_r_xm, 2, argList{Yml, Yxr}}, // XMM MOVD (32 bit)
 	{Zr_m_xm, 2, argList{Yxr, Yml}}, // XMM MOVD (32 bit)
 	{Zaut_r, 2, argList{Yiauto, Yrl}},
 }

 // yret: no operand, or a single 32-bit-class immediate.
 var yret = []ytab{
 	{Zo_iw, 1, argList{}},
 	{Zo_iw, 1, argList{Yi32}},
 }

 // ymovq: 64-bit move forms. The per-row comments give the prefix/opcode
 // each row pairs with in the instruction's opBytes.
 var ymovq = []ytab{
 	// valid in 32-bit mode
 	{Zm_r_xm_nr, 1, argList{Ym, Ymr}},  // 0x6f MMX MOVQ (shorter encoding)
 	{Zr_m_xm_nr, 1, argList{Ymr, Ym}},  // 0x7f MMX MOVQ
 	{Zm_r_xm_nr, 2, argList{Yxr, Ymr}}, // Pf2, 0xd6 MOVDQ2Q
 	{Zm_r_xm_nr, 2, argList{Yxm, Yxr}}, // Pf3, 0x7e MOVQ xmm1/m64 -> xmm2
 	{Zr_m_xm_nr, 2, argList{Yxr, Yxm}}, // Pe, 0xd6 MOVQ xmm1 -> xmm2/m64

 	// valid only in 64-bit mode, usually with 64-bit prefix
 	{Zr_m, 1, argList{Yrl, Yml}},      // 0x89
 	{Zm_r, 1, argList{Yml, Yrl}},      // 0x8b
 	{Zilo_m, 2, argList{Ys32, Yrl}},   // 32 bit signed 0xc7,(0)
 	{Ziq_rp, 1, argList{Yi64, Yrl}},   // 0xb8 -- 32/64 bit immediate
 	{Zilo_m, 2, argList{Yi32, Yml}},   // 0xc7,(0)
 	{Zm_r_xm, 1, argList{Ymm, Ymr}},   // 0x6e MMX MOVD
 	{Zr_m_xm, 1, argList{Ymr, Ymm}},   // 0x7e MMX MOVD
 	{Zm_r_xm, 2, argList{Yml, Yxr}},   // Pe, 0x6e MOVD xmm load
 	{Zr_m_xm, 2, argList{Yxr, Yml}},   // Pe, 0x7e MOVD xmm store
 	{Zaut_r, 1, argList{Yiauto, Yrl}}, // 0 built-in LEAQ
 }

 // ymovbe: memory-to-register and register-to-memory forms; each row
 // reserves 3 bytes of opBytes.
 var ymovbe = []ytab{
 	{Zlitm_r, 3, argList{Ym, Yrl}},
 	{Zlitr_m, 3, argList{Yrl, Ym}},
 }

 // ym_rl: memory operand to register. Used by LEAL, LEAQ and LEAW.
 var ym_rl = []ytab{
 	{Zm_r, 1, argList{Ym, Yrl}},
 }

 // yrl_m: register to memory operand. Used by BOUNDL and BOUNDW.
 var yrl_m = []ytab{
 	{Zr_m, 1, argList{Yrl, Ym}},
 }

 // ymb_rl: byte r/m operand to register. Used by the MOVB?SX/MOVB?ZX
 // extension moves.
 var ymb_rl = []ytab{
 	{Zmb_r, 1, argList{Ymb, Yrl}},
 }

 // yml_rl: r/m operand to register. Used by many optab entries, including
 // the CMOV families, BSF/BSR, LAR, LSL and LZCNT.
 var yml_rl = []ytab{
 	{Zm_r, 1, argList{Yml, Yrl}},
 }

 // yrl_ml: register to r/m operand. Used by ARPL.
 var yrl_ml = []ytab{
 	{Zr_m, 1, argList{Yrl, Yml}},
 }

 // yml_mb: byte register/memory forms in both directions.
 // NOTE(review): despite the name, both rows use the byte classes Yrb/Ymb.
 var yml_mb = []ytab{
 	{Zr_m, 1, argList{Yrb, Ymb}},
 	{Zm_r, 1, argList{Ymb, Yrb}},
 }

 // yrb_mb: byte register to byte r/m operand.
 var yrb_mb = []ytab{
 	{Zr_m, 1, argList{Yrb, Ymb}},
 }

 // yxchg: exchange-style forms; the two Yax rows come before the general
 // register/memory rows.
 var yxchg = []ytab{
 	{Z_rp, 1, argList{Yax, Yrl}},
 	{Zrp_, 1, argList{Yrl, Yax}},
 	{Zr_m, 1, argList{Yrl, Yml}},
 	{Zm_r, 1, argList{Yml, Yrl}},
 }

 // ydivl: a single r/m operand. Used by DIV and IDIV (L, Q, W) and by LTR.
 var ydivl = []ytab{
 	{Zm_o, 2, argList{Yml}},
 }

 // ydivb: a single byte r/m operand. Used by DIVB, IDIVB and IMULB.
 var ydivb = []ytab{
 	{Zm_o, 2, argList{Ymb}},
 }

 // yimul: one-operand form, immediate-times-register forms, and the
 // two-operand r/m-to-register form. Used by IMULL, IMULQ and IMULW.
 var yimul = []ytab{
 	{Zm_o, 2, argList{Yml}},
 	{Zib_rr, 1, argList{Yi8, Yrl}},
 	{Zil_rr, 1, argList{Yi32, Yrl}},
 	{Zm_r, 2, argList{Yml, Yrl}},
 }

 // yimul3: three-operand forms (immediate, r/m, register).
 // Used by IMUL3W, IMUL3L and IMUL3Q.
 var yimul3 = []ytab{
 	{Zibm_r, 2, argList{Yi8, Yml, Yrl}},
 	{Zibm_r, 2, argList{Yi32, Yml, Yrl}},
 }

 // ybyte: a single immediate of class Yi64. Used by BYTE and LONG, whose
 // opBytes hold 1 and 4 respectively.
 var ybyte = []ytab{
 	{Zbyte, 1, argList{Yi64}},
 }

 // yin: an immediate operand, or no operand. Used by INB, INW and INL.
 var yin = []ytab{
 	{Zib_, 1, argList{Yi32}},
 	{Zlit, 1, argList{}},
 }

 // yint: a single immediate operand. Used by INT.
 var yint = []ytab{
 	{Zib_, 1, argList{Yi32}},
 }

 // ypushl: a register, a memory operand, an 8-bit immediate or a 32-bit
 // immediate.
 var ypushl = []ytab{
 	{Zrp_, 1, argList{Yrl}},
 	{Zm_o, 2, argList{Ym}},
 	{Zib_, 1, argList{Yi8}},
 	{Zil_, 1, argList{Yi32}},
 }

 // ypopl: a register or a memory operand.
 var ypopl = []ytab{
 	{Z_rp, 1, argList{Yrl}},
 	{Zo_m, 2, argList{Ym}},
 }

 // ywrfsbase: a single register operand, encoded via Zm_o.
 var ywrfsbase = []ytab{
 	{Zm_o, 2, argList{Yrl}},
 }

 // yrdrand: a single register operand, encoded via Zo_m.
 var yrdrand = []ytab{
 	{Zo_m, 2, argList{Yrl}},
 }

 // yclflush: a single memory operand. Used by CLDEMOTE, CLFLUSH,
 // CLFLUSHOPT and CLWB.
 var yclflush = []ytab{
 	{Zo_m, 2, argList{Ym}},
 }

 // ybswap: a single register operand. Used by BSWAPL and BSWAPQ.
 var ybswap = []ytab{
 	{Z_rp, 2, argList{Yrl}},
 }

 // yscond: a single byte r/m operand. Used by INCB and DECB in the visible
 // part of optab.
 var yscond = []ytab{
 	{Zo_m, 2, argList{Ymb}},
 }

 // yjcond: a branch target, optionally preceded by a $0 or $1 operand.
 // Used by the conditional jumps (JCC, JEQ, JNE, ...).
 var yjcond = []ytab{
 	{Zbr, 0, argList{Ybr}},
 	{Zbr, 0, argList{Yi0, Ybr}},
 	{Zbr, 1, argList{Yi1, Ybr}},
 }

 // yloop: a branch target. Used by JCXZ* and LOOP*.
 var yloop = []ytab{
 	{Zloop, 1, argList{Ybr}},
 }

 // ycall: forms accepted by CALL: indirect through r/m or a register pair
 // class, indirect via Yindir, direct to a branch target, or to a constant.
 var ycall = []ytab{
 	{Zcallindreg, 0, argList{Yml}},
 	{Zcallindreg, 2, argList{Yrx, Yrx}},
 	{Zcallind, 2, argList{Yindir}},
 	{Zcall, 0, argList{Ybr}},
 	{Zcallcon, 1, argList{Yi32}},
 }

 // yduff: a single immediate operand, encoded via Zcallduff.
 var yduff = []ytab{
 	{Zcallduff, 1, argList{Yi32}},
 }

 // yjmp: forms accepted by JMP: indirect through r/m, direct to a branch
 // target, or to a constant.
 var yjmp = []ytab{
 	{Zo_m64, 2, argList{Yml}},
 	{Zjmp, 0, argList{Ybr}},
 	{Zjmpcon, 1, argList{Yi32}},
 }

 // The tables from yfmvd through ycompp pair Yf0 with a memory operand (Ym)
 // or with Yrf.
 // NOTE(review): presumably these serve the x87 floating-point instructions;
 // their optab entries are outside this section.

 // yfmvd: Ym or Yrf to Yf0, and Yf0 to Ym or Yrf.
 var yfmvd = []ytab{
 	{Zm_o, 2, argList{Ym, Yf0}},
 	{Zo_m, 2, argList{Yf0, Ym}},
 	{Zm_o, 2, argList{Yrf, Yf0}},
 	{Zo_m, 2, argList{Yf0, Yrf}},
 }

 // yfmvdp: Yf0 to Ym or Yrf only.
 var yfmvdp = []ytab{
 	{Zo_m, 2, argList{Yf0, Ym}},
 	{Zo_m, 2, argList{Yf0, Yrf}},
 }

 // yfmvf: Ym to Yf0 and Yf0 to Ym.
 var yfmvf = []ytab{
 	{Zm_o, 2, argList{Ym, Yf0}},
 	{Zo_m, 2, argList{Yf0, Ym}},
 }

 // yfmvx: Ym to Yf0 only.
 var yfmvx = []ytab{
 	{Zm_o, 2, argList{Ym, Yf0}},
 }

 // yfmvp: Yf0 to Ym only.
 var yfmvp = []ytab{
 	{Zo_m, 2, argList{Yf0, Ym}},
 }

 // yfcmv: Yrf to Yf0 only.
 var yfcmv = []ytab{
 	{Zm_o, 2, argList{Yrf, Yf0}},
 }

 // yfadd: Ym or Yrf to Yf0, and Yf0 to Yrf.
 var yfadd = []ytab{
 	{Zm_o, 2, argList{Ym, Yf0}},
 	{Zm_o, 2, argList{Yrf, Yf0}},
 	{Zo_m, 2, argList{Yf0, Yrf}},
 }

 // yfxch: Yf0 and Yrf in either order.
 var yfxch = []ytab{
 	{Zo_m, 2, argList{Yf0, Yrf}},
 	{Zm_o, 2, argList{Yrf, Yf0}},
 }

 // ycompp: Yf0 to Yrf; see the caveat on the row.
 var ycompp = []ytab{
 	{Zo_m, 2, argList{Yf0, Yrf}}, // botch is really f0,f1
 }

 // ystsw: a memory operand, or Yax encoded as literal bytes.
 var ystsw = []ytab{
 	{Zo_m, 2, argList{Ym}},
 	{Zlit, 1, argList{Yax}},
 }

 // ysvrs_mo: a single memory operand encoded via Zm_o.
 // Used by FXRSTOR, FXRSTOR64 and LDMXCSR.
 var ysvrs_mo = []ytab{
 	{Zm_o, 2, argList{Ym}},
 }

 // unaryDst version of "ysvrs_mo".
 // Used by FXSAVE and FXSAVE64.
 var ysvrs_om = []ytab{
 	{Zo_m, 2, argList{Ym}},
 }

 // ymm: an MMX form (Ymm to Ymr) followed by an XMM form (Yxm to Yxr).
 var ymm = []ytab{
 	{Zm_r_xm, 1, argList{Ymm, Ymr}},
 	{Zm_r_xm, 2, argList{Yxm, Yxr}},
 }

 // yxm: XMM r/m to XMM register. Used by most packed/scalar arithmetic
 // entries in optab (ADDPD, ANDPS, DIVSD, MAXSS, ...).
 var yxm = []ytab{
 	{Zm_r_xm, 1, argList{Yxm, Yxr}},
 }

 // yxm_q4: the same operands as yxm but encoded via Zm_r.
 var yxm_q4 = []ytab{
 	{Zm_r, 1, argList{Yxm, Yxr}},
 }

 // yxcvm1: XMM r/m to an XMM register or to an MMX register.
 // Used by CVTPD2PL, CVTPS2PL, CVTTPD2PL and CVTTPS2PL.
 var yxcvm1 = []ytab{
 	{Zm_r_xm, 2, argList{Yxm, Yxr}},
 	{Zm_r_xm, 2, argList{Yxm, Ymr}},
 }

 // yxcvm2: XMM r/m or MMX r/m to an XMM register.
 // Used by CVTPL2PD and CVTPL2PS.
 var yxcvm2 = []ytab{
 	{Zm_r_xm, 2, argList{Yxm, Yxr}},
 	{Zm_r_xm, 2, argList{Ymm, Yxr}},
 }

 // yxr: XMM register to XMM register. Used by MASKMOVOU, MOVHLPS, MOVLHPS.
 var yxr = []ytab{
 	{Zm_r_xm, 1, argList{Yxr, Yxr}},
 }

 // yxr_ml: XMM register to general r/m operand.
 var yxr_ml = []ytab{
 	{Zr_m_xm, 1, argList{Yxr, Yml}},
 }

 // ymr: MMX register to MMX register. Used by MASKMOVQ.
 var ymr = []ytab{
 	{Zm_r, 1, argList{Ymr, Ymr}},
 }

 // ymr_ml: MMX register to general r/m operand.
 var ymr_ml = []ytab{
 	{Zr_m_xm, 1, argList{Ymr, Yml}},
 }

 // yxcmpi: XMM r/m, XMM register and an 8-bit immediate.
 // Used by CMPPD, CMPPS, CMPSD and CMPSS.
 var yxcmpi = []ytab{
 	{Zm_r_i_xm, 2, argList{Yxm, Yxr, Yi8}},
 }

 // yxmov: XMM load and store forms. Used by MOVAPD/MOVAPS, MOVO/MOVOU,
 // MOVHPD/MOVHPS and MOVLPD/MOVLPS.
 var yxmov = []ytab{
 	{Zm_r_xm, 1, argList{Yxm, Yxr}},
 	{Zr_m_xm, 1, argList{Yxr, Yxm}},
 }

 // yxcvfl: XMM r/m to general register. Used by CVT(T)SD2SL and CVT(T)SS2SL.
 var yxcvfl = []ytab{
 	{Zm_r_xm, 1, argList{Yxm, Yrl}},
 }

 // yxcvlf: general r/m to XMM register. Used by CVTSL2SD and CVTSL2SS.
 var yxcvlf = []ytab{
 	{Zm_r_xm, 1, argList{Yml, Yxr}},
 }

 // yxcvfq: like yxcvfl but reserving 2 bytes of opBytes.
 // Used by CVT(T)SD2SQ and CVT(T)SS2SQ.
 var yxcvfq = []ytab{
 	{Zm_r_xm, 2, argList{Yxm, Yrl}},
 }

 // yxcvqf: like yxcvlf but reserving 2 bytes of opBytes.
 // Used by CVTSQ2SD and CVTSQ2SS.
 var yxcvqf = []ytab{
 	{Zm_r_xm, 2, argList{Yml, Yxr}},
 }

 // yps: register/memory and 8-bit-immediate forms, first for MMX and then
 // for XMM operands.
 var yps = []ytab{
 	{Zm_r_xm, 1, argList{Ymm, Ymr}},
 	{Zibo_m_xm, 2, argList{Yi8, Ymr}},
 	{Zm_r_xm, 2, argList{Yxm, Yxr}},
 	{Zibo_m_xm, 3, argList{Yi8, Yxr}},
 }

 // yxrrl: XMM register to general register. Used by MOVMSKPD and MOVMSKPS.
 var yxrrl = []ytab{
 	{Zm_r, 1, argList{Yxr, Yrl}},
 }

 // ymrxr: MMX register or XMM r/m to XMM register.
 var ymrxr = []ytab{
 	{Zm_r, 1, argList{Ymr, Yxr}},
 	{Zm_r_xm, 1, argList{Yxm, Yxr}},
 }

 // ymshuf: 8-bit immediate, MMX r/m and MMX register.
 var ymshuf = []ytab{
 	{Zibm_r, 2, argList{Yi8, Ymm, Ymr}},
 }

 // ymshufb: XMM r/m to XMM register, encoded via Zm2_r.
 var ymshufb = []ytab{
 	{Zm2_r, 2, argList{Yxm, Yxr}},
 }

 // yxshuf: unsigned 8-bit immediate, XMM r/m and XMM register.
 // Used by DPPD, DPPS and INSERTPS in the visible part of optab.
 //
 // It should never have more than 1 entry,
 // because some optab entries use opcode sequences that
 // are longer than 2 bytes (zoffset=2 here),
 // ROUNDPD and ROUNDPS and recently added BLENDPD,
 // to name a few.
 var yxshuf = []ytab{
 	{Zibm_r, 2, argList{Yu8, Yxm, Yxr}},
 }

 // yextrw: unsigned 8-bit immediate and XMM register, to a general register
 // or to a general r/m operand.
 var yextrw = []ytab{
 	{Zibm_r, 2, argList{Yu8, Yxr, Yrl}},
 	{Zibr_m, 2, argList{Yu8, Yxr, Yml}},
 }

 // yextr: unsigned 8-bit immediate and XMM register to a Ymm operand;
 // reserves 3 bytes of opBytes.
 var yextr = []ytab{
 	{Zibr_m, 3, argList{Yu8, Yxr, Ymm}},
 }

 // yinsrw: unsigned 8-bit immediate and general r/m operand to XMM register.
 var yinsrw = []ytab{
 	{Zibm_r, 2, argList{Yu8, Yml, Yxr}},
 }

 // yinsr: unsigned 8-bit immediate and Ymm operand to XMM register;
 // reserves 3 bytes of opBytes.
 var yinsr = []ytab{
 	{Zibm_r, 3, argList{Yu8, Ymm, Yxr}},
 }

 // ypsdq: 8-bit immediate applied to an XMM register.
 var ypsdq = []ytab{
 	{Zibo_m, 2, argList{Yi8, Yxr}},
 }

 // ymskb: XMM register or MMX register to general register.
 var ymskb = []ytab{
 	{Zm_r_xm, 2, argList{Yxr, Yrl}},
 	{Zm_r_xm, 1, argList{Ymr, Yrl}},
 }

 // ycrc32l: general r/m operand to register, encoded via Zlitm_r.
 var ycrc32l = []ytab{
 	{Zlitm_r, 0, argList{Yml, Yrl}},
 }

 // ycrc32b: byte r/m operand to register, encoded via Zlitm_r.
 var ycrc32b = []ytab{
 	{Zlitm_r, 0, argList{Ymb, Yrl}},
 }

 // yprefetch: a single memory operand.
 var yprefetch = []ytab{
 	{Zm_o, 2, argList{Ym}},
 }

 // yaes: XMM r/m to XMM register, encoded via Zlitm_r with 2 bytes reserved.
 var yaes = []ytab{
 	{Zlitm_r, 2, argList{Yxm, Yxr}},
 }

 // yxbegin: a branch target, encoded via Zjmp.
 var yxbegin = []ytab{
 	{Zjmp, 1, argList{Ybr}},
 }

 // yxabort: a single unsigned 8-bit immediate.
 var yxabort = []ytab{
 	{Zib_, 1, argList{Yu8}},
 }

 // ylddqu: memory operand to XMM register. Used by LDDQU.
 var ylddqu = []ytab{
 	{Zm_r, 1, argList{Ym, Yxr}},
 }

 // ypalignr: unsigned 8-bit immediate, XMM r/m and XMM register.
 var ypalignr = []ytab{
 	{Zibm_r, 2, argList{Yu8, Yxm, Yxr}},
 }

 // ysha256rnds2: X0 (Yxr0), XMM r/m and XMM register, encoded via Zlit_m_r.
 var ysha256rnds2 = []ytab{
 	{Zlit_m_r, 0, argList{Yxr0, Yxm, Yxr}},
 }

 // yblendvpd: X0 (Yxr0), XMM r/m and XMM register, encoded via Z_m_r.
 var yblendvpd = []ytab{
 	{Z_m_r, 1, argList{Yxr0, Yxm, Yxr}},
 }

 // ymmxmm0f38: an MMX form reserving 3 bytes of opBytes followed by an XMM
 // form reserving 5.
 var ymmxmm0f38 = []ytab{
 	{Zlitm_r, 3, argList{Ymm, Ymr}},
 	{Zlitm_r, 5, argList{Yxm, Yxr}},
 }

 // yextractps: 2-bit immediate and XMM register to a general r/m operand.
 // Used by EXTRACTPS.
 var yextractps = []ytab{
 	{Zibr_m, 2, argList{Yu2, Yxr, Yml}},
 }

 // ysha1rnds4: 2-bit immediate, XMM r/m and XMM register.
 var ysha1rnds4 = []ytab{
 	{Zibm_r, 2, argList{Yu2, Yxm, Yxr}},
 }

 // You are doasm, holding in your hand a *obj.Prog with p.As set to, say,
 // ACRC32, and p.From and p.To as operands (obj.Addr).  The linker scans optab
 // to find the entry with the given p.As and then looks through the ytable for
 // that instruction (the second field in the optab struct) for a line whose
 // argList matches the Ytypes of the instruction's operands.  The
 // function oclass computes the specific Ytype of an operand and then the set
 // of more general Ytypes that it satisfies is implied by the ycover table, set
 // up in instinit.  For example, oclass distinguishes the constants 0 and 1
 // from the more general 8-bit constants, but instinit says
 //
 //        ycover[Yi0*Ymax+Ys32] = 1
 //        ycover[Yi1*Ymax+Ys32] = 1
 //        ycover[Yi8*Ymax+Ys32] = 1
 //
 // which means that Yi0, Yi1, and Yi8 all count as Ys32 (signed 32)
 // if that's what an instruction can handle.
 //
 // In parallel with the scan through the ytable for the appropriate line, there
 // is a z pointer that starts out pointing at the strange magic byte list in
 // the Optab struct.  With each step past a non-matching ytable line, z
 // advances by that line's zoffset (the 2nd entry in the line).  When a
 // matching line is found, that z pointer has the extra data to use in laying
 // down the instruction bytes.  The actual bytes laid down are a function of
 // the 1st entry in the line (that is, the Ztype) and the z bytes.
 //
 // For example, let's look at AADDL.  The optab line says:
 //        {AADDL, yaddl, Px, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
 //
 // and yaddl says
 //        var yaddl = []ytab{
 //                {Zibo_m, 2, argList{Yi8, Yml}},
 //                {Zil_, 1, argList{Yi32, Yax}},
 //                {Zilo_m, 2, argList{Yi32, Yml}},
 //                {Zr_m, 1, argList{Yrl, Yml}},
 //                {Zm_r, 1, argList{Yml, Yrl}},
 //        }
 //
 // so there are 5 possible types of ADDL instruction that can be laid down, and
 // possible states used to lay them down (Ztype and z pointer, assuming z
 // points at opBytes{0x83, 00, 0x05,0x81, 00, 0x01, 0x03}) are:
 //
 //        Yi8, Yml -> Zibo_m, z (0x83, 00)
 //        Yi32, Yax -> Zil_, z+2 (0x05)
 //        Yi32, Yml -> Zilo_m, z+2+1 (0x81, 0x00)
 //        Yrl, Yml -> Zr_m, z+2+1+2 (0x01)
 //        Yml, Yrl -> Zm_r, z+2+1+2+1 (0x03)
 //
 // The Pconstant in the optab line controls the prefix bytes to emit.  That's
 // relatively straightforward as this program goes.
 //
 // The switch on yt.zcase in doasm implements the various Z cases.  Zibo_m, for
 // example, is an opcode byte (z[0]) then an asmando (which is some kind of
 // encoded addressing mode for the Yml arg), and then a single immediate byte.
 // Zilo_m is the same but a long (32-bit) immediate.
 var optab =
 //	as, ytab, andproto, opcode
 [...]Optab{
 	{obj.AXXX, nil, 0, opBytes{}},
 	{AAAA, ynone, P32, opBytes{0x37}},
 	{AAAD, ynone, P32, opBytes{0xd5, 0x0a}},
 	{AAAM, ynone, P32, opBytes{0xd4, 0x0a}},
 	{AAAS, ynone, P32, opBytes{0x3f}},
 	{AADCB, yxorb, Pb, opBytes{0x14, 0x80, 02, 0x10, 0x12}},
 	{AADCL, yaddl, Px, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
 	{AADCQ, yaddl, Pw, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
 	{AADCW, yaddl, Pe, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
 	{AADCXL, yml_rl, Pq4, opBytes{0xf6}},
 	{AADCXQ, yml_rl, Pq4w, opBytes{0xf6}},
 	{AADDB, yxorb, Pb, opBytes{0x04, 0x80, 00, 0x00, 0x02}},
 	{AADDL, yaddl, Px, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
 	{AADDPD, yxm, Pq, opBytes{0x58}},
 	{AADDPS, yxm, Pm, opBytes{0x58}},
 	{AADDQ, yaddl, Pw, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
 	{AADDSD, yxm, Pf2, opBytes{0x58}},
 	{AADDSS, yxm, Pf3, opBytes{0x58}},
 	{AADDSUBPD, yxm, Pq, opBytes{0xd0}},
 	{AADDSUBPS, yxm, Pf2, opBytes{0xd0}},
 	{AADDW, yaddl, Pe, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
 	{AADOXL, yml_rl, Pq5, opBytes{0xf6}},
 	{AADOXQ, yml_rl, Pq5w, opBytes{0xf6}},
 	{AADJSP, nil, 0, opBytes{}},
 	{AANDB, yxorb, Pb, opBytes{0x24, 0x80, 04, 0x20, 0x22}},
 	{AANDL, yaddl, Px, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
 	{AANDNPD, yxm, Pq, opBytes{0x55}},
 	{AANDNPS, yxm, Pm, opBytes{0x55}},
 	{AANDPD, yxm, Pq, opBytes{0x54}},
 	{AANDPS, yxm, Pm, opBytes{0x54}},
 	{AANDQ, yaddl, Pw, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
 	{AANDW, yaddl, Pe, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
 	{AARPL, yrl_ml, P32, opBytes{0x63}},
 	{ABOUNDL, yrl_m, P32, opBytes{0x62}},
 	{ABOUNDW, yrl_m, Pe, opBytes{0x62}},
 	{ABSFL, yml_rl, Pm, opBytes{0xbc}},
 	{ABSFQ, yml_rl, Pw, opBytes{0x0f, 0xbc}},
 	{ABSFW, yml_rl, Pq, opBytes{0xbc}},
 	{ABSRL, yml_rl, Pm, opBytes{0xbd}},
 	{ABSRQ, yml_rl, Pw, opBytes{0x0f, 0xbd}},
 	{ABSRW, yml_rl, Pq, opBytes{0xbd}},
 	{ABSWAPL, ybswap, Px, opBytes{0x0f, 0xc8}},
 	{ABSWAPQ, ybswap, Pw, opBytes{0x0f, 0xc8}},
 	{ABTCL, ybtl, Pm, opBytes{0xba, 07, 0xbb}},
 	{ABTCQ, ybtl, Pw, opBytes{0x0f, 0xba, 07, 0x0f, 0xbb}},
 	{ABTCW, ybtl, Pq, opBytes{0xba, 07, 0xbb}},
 	{ABTL, ybtl, Pm, opBytes{0xba, 04, 0xa3}},
 	{ABTQ, ybtl, Pw, opBytes{0x0f, 0xba, 04, 0x0f, 0xa3}},
 	{ABTRL, ybtl, Pm, opBytes{0xba, 06, 0xb3}},
 	{ABTRQ, ybtl, Pw, opBytes{0x0f, 0xba, 06, 0x0f, 0xb3}},
 	{ABTRW, ybtl, Pq, opBytes{0xba, 06, 0xb3}},
 	{ABTSL, ybtl, Pm, opBytes{0xba, 05, 0xab}},
 	{ABTSQ, ybtl, Pw, opBytes{0x0f, 0xba, 05, 0x0f, 0xab}},
 	{ABTSW, ybtl, Pq, opBytes{0xba, 05, 0xab}},
 	{ABTW, ybtl, Pq, opBytes{0xba, 04, 0xa3}},
 	{ABYTE, ybyte, Px, opBytes{1}},
 	{obj.ACALL, ycall, Px, opBytes{0xff, 02, 0xff, 0x15, 0xe8}},
 	{ACBW, ynone, Pe, opBytes{0x98}},
 	{ACDQ, ynone, Px, opBytes{0x99}},
 	{ACDQE, ynone, Pw, opBytes{0x98}},
 	{ACLAC, ynone, Pm, opBytes{01, 0xca}},
 	{ACLC, ynone, Px, opBytes{0xf8}},
 	{ACLD, ynone, Px, opBytes{0xfc}},
 	{ACLDEMOTE, yclflush, Pm, opBytes{0x1c, 00}},
 	{ACLFLUSH, yclflush, Pm, opBytes{0xae, 07}},
 	{ACLFLUSHOPT, yclflush, Pq, opBytes{0xae, 07}},
 	{ACLI, ynone, Px, opBytes{0xfa}},
 	{ACLTS, ynone, Pm, opBytes{0x06}},
 	{ACLWB, yclflush, Pq, opBytes{0xae, 06}},
 	{ACMC, ynone, Px, opBytes{0xf5}},
 	{ACMOVLCC, yml_rl, Pm, opBytes{0x43}},
 	{ACMOVLCS, yml_rl, Pm, opBytes{0x42}},
 	{ACMOVLEQ, yml_rl, Pm, opBytes{0x44}},
 	{ACMOVLGE, yml_rl, Pm, opBytes{0x4d}},
 	{ACMOVLGT, yml_rl, Pm, opBytes{0x4f}},
 	{ACMOVLHI, yml_rl, Pm, opBytes{0x47}},
 	{ACMOVLLE, yml_rl, Pm, opBytes{0x4e}},
 	{ACMOVLLS, yml_rl, Pm, opBytes{0x46}},
 	{ACMOVLLT, yml_rl, Pm, opBytes{0x4c}},
 	{ACMOVLMI, yml_rl, Pm, opBytes{0x48}},
 	{ACMOVLNE, yml_rl, Pm, opBytes{0x45}},
 	{ACMOVLOC, yml_rl, Pm, opBytes{0x41}},
 	{ACMOVLOS, yml_rl, Pm, opBytes{0x40}},
 	{ACMOVLPC, yml_rl, Pm, opBytes{0x4b}},
 	{ACMOVLPL, yml_rl, Pm, opBytes{0x49}},
 	{ACMOVLPS, yml_rl, Pm, opBytes{0x4a}},
 	{ACMOVQCC, yml_rl, Pw, opBytes{0x0f, 0x43}},
 	{ACMOVQCS, yml_rl, Pw, opBytes{0x0f, 0x42}},
 	{ACMOVQEQ, yml_rl, Pw, opBytes{0x0f, 0x44}},
 	{ACMOVQGE, yml_rl, Pw, opBytes{0x0f, 0x4d}},
 	{ACMOVQGT, yml_rl, Pw, opBytes{0x0f, 0x4f}},
 	{ACMOVQHI, yml_rl, Pw, opBytes{0x0f, 0x47}},
 	{ACMOVQLE, yml_rl, Pw, opBytes{0x0f, 0x4e}},
 	{ACMOVQLS, yml_rl, Pw, opBytes{0x0f, 0x46}},
 	{ACMOVQLT, yml_rl, Pw, opBytes{0x0f, 0x4c}},
 	{ACMOVQMI, yml_rl, Pw, opBytes{0x0f, 0x48}},
 	{ACMOVQNE, yml_rl, Pw, opBytes{0x0f, 0x45}},
 	{ACMOVQOC, yml_rl, Pw, opBytes{0x0f, 0x41}},
 	{ACMOVQOS, yml_rl, Pw, opBytes{0x0f, 0x40}},
 	{ACMOVQPC, yml_rl, Pw, opBytes{0x0f, 0x4b}},
 	{ACMOVQPL, yml_rl, Pw, opBytes{0x0f, 0x49}},
 	{ACMOVQPS, yml_rl, Pw, opBytes{0x0f, 0x4a}},
 	{ACMOVWCC, yml_rl, Pq, opBytes{0x43}},
 	{ACMOVWCS, yml_rl, Pq, opBytes{0x42}},
 	{ACMOVWEQ, yml_rl, Pq, opBytes{0x44}},
 	{ACMOVWGE, yml_rl, Pq, opBytes{0x4d}},
 	{ACMOVWGT, yml_rl, Pq, opBytes{0x4f}},
 	{ACMOVWHI, yml_rl, Pq, opBytes{0x47}},
 	{ACMOVWLE, yml_rl, Pq, opBytes{0x4e}},
 	{ACMOVWLS, yml_rl, Pq, opBytes{0x46}},
 	{ACMOVWLT, yml_rl, Pq, opBytes{0x4c}},
 	{ACMOVWMI, yml_rl, Pq, opBytes{0x48}},
 	{ACMOVWNE, yml_rl, Pq, opBytes{0x45}},
 	{ACMOVWOC, yml_rl, Pq, opBytes{0x41}},
 	{ACMOVWOS, yml_rl, Pq, opBytes{0x40}},
 	{ACMOVWPC, yml_rl, Pq, opBytes{0x4b}},
 	{ACMOVWPL, yml_rl, Pq, opBytes{0x49}},
 	{ACMOVWPS, yml_rl, Pq, opBytes{0x4a}},
 	{ACMPB, ycmpb, Pb, opBytes{0x3c, 0x80, 07, 0x38, 0x3a}},
 	{ACMPL, ycmpl, Px, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
 	{ACMPPD, yxcmpi, Px, opBytes{Pe, 0xc2}},
 	{ACMPPS, yxcmpi, Pm, opBytes{0xc2, 0}},
 	{ACMPQ, ycmpl, Pw, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
 	{ACMPSB, ynone, Pb, opBytes{0xa6}},
 	{ACMPSD, yxcmpi, Px, opBytes{Pf2, 0xc2}},
 	{ACMPSL, ynone, Px, opBytes{0xa7}},
 	{ACMPSQ, ynone, Pw, opBytes{0xa7}},
 	{ACMPSS, yxcmpi, Px, opBytes{Pf3, 0xc2}},
 	{ACMPSW, ynone, Pe, opBytes{0xa7}},
 	{ACMPW, ycmpl, Pe, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
 	{ACOMISD, yxm, Pe, opBytes{0x2f}},
 	{ACOMISS, yxm, Pm, opBytes{0x2f}},
 	{ACPUID, ynone, Pm, opBytes{0xa2}},
 	{ACVTPL2PD, yxcvm2, Px, opBytes{Pf3, 0xe6, Pe, 0x2a}},
 	{ACVTPL2PS, yxcvm2, Pm, opBytes{0x5b, 0, 0x2a, 0}},
 	{ACVTPD2PL, yxcvm1, Px, opBytes{Pf2, 0xe6, Pe, 0x2d}},
 	{ACVTPD2PS, yxm, Pe, opBytes{0x5a}},
 	{ACVTPS2PL, yxcvm1, Px, opBytes{Pe, 0x5b, Pm, 0x2d}},
 	{ACVTPS2PD, yxm, Pm, opBytes{0x5a}},
 	{ACVTSD2SL, yxcvfl, Pf2, opBytes{0x2d}},
 	{ACVTSD2SQ, yxcvfq, Pw, opBytes{Pf2, 0x2d}},
 	{ACVTSD2SS, yxm, Pf2, opBytes{0x5a}},
 	{ACVTSL2SD, yxcvlf, Pf2, opBytes{0x2a}},
 	{ACVTSQ2SD, yxcvqf, Pw, opBytes{Pf2, 0x2a}},
 	{ACVTSL2SS, yxcvlf, Pf3, opBytes{0x2a}},
 	{ACVTSQ2SS, yxcvqf, Pw, opBytes{Pf3, 0x2a}},
 	{ACVTSS2SD, yxm, Pf3, opBytes{0x5a}},
 	{ACVTSS2SL, yxcvfl, Pf3, opBytes{0x2d}},
 	{ACVTSS2SQ, yxcvfq, Pw, opBytes{Pf3, 0x2d}},
 	{ACVTTPD2PL, yxcvm1, Px, opBytes{Pe, 0xe6, Pe, 0x2c}},
 	{ACVTTPS2PL, yxcvm1, Px, opBytes{Pf3, 0x5b, Pm, 0x2c}},
 	{ACVTTSD2SL, yxcvfl, Pf2, opBytes{0x2c}},
 	{ACVTTSD2SQ, yxcvfq, Pw, opBytes{Pf2, 0x2c}},
 	{ACVTTSS2SL, yxcvfl, Pf3, opBytes{0x2c}},
 	{ACVTTSS2SQ, yxcvfq, Pw, opBytes{Pf3, 0x2c}},
 	{ACWD, ynone, Pe, opBytes{0x99}},
 	{ACWDE, ynone, Px, opBytes{0x98}},
 	{ACQO, ynone, Pw, opBytes{0x99}},
 	{ADAA, ynone, P32, opBytes{0x27}},
 	{ADAS, ynone, P32, opBytes{0x2f}},
 	{ADECB, yscond, Pb, opBytes{0xfe, 01}},
 	{ADECL, yincl, Px1, opBytes{0x48, 0xff, 01}},
 	{ADECQ, yincq, Pw, opBytes{0xff, 01}},
 	{ADECW, yincq, Pe, opBytes{0xff, 01}},
 	{ADIVB, ydivb, Pb, opBytes{0xf6, 06}},
 	{ADIVL, ydivl, Px, opBytes{0xf7, 06}},
 	{ADIVPD, yxm, Pe, opBytes{0x5e}},
 	{ADIVPS, yxm, Pm, opBytes{0x5e}},
 	{ADIVQ, ydivl, Pw, opBytes{0xf7, 06}},
 	{ADIVSD, yxm, Pf2, opBytes{0x5e}},
 	{ADIVSS, yxm, Pf3, opBytes{0x5e}},
 	{ADIVW, ydivl, Pe, opBytes{0xf7, 06}},
 	{ADPPD, yxshuf, Pq, opBytes{0x3a, 0x41, 0}},
 	{ADPPS, yxshuf, Pq, opBytes{0x3a, 0x40, 0}},
 	{AEMMS, ynone, Pm, opBytes{0x77}},
 	{AEXTRACTPS, yextractps, Pq, opBytes{0x3a, 0x17, 0}},
 	{AENTER, nil, 0, opBytes{}}, // botch
 	{AFXRSTOR, ysvrs_mo, Pm, opBytes{0xae, 01, 0xae, 01}},
 	{AFXSAVE, ysvrs_om, Pm, opBytes{0xae, 00, 0xae, 00}},
 	{AFXRSTOR64, ysvrs_mo, Pw, opBytes{0x0f, 0xae, 01, 0x0f, 0xae, 01}},
 	{AFXSAVE64, ysvrs_om, Pw, opBytes{0x0f, 0xae, 00, 0x0f, 0xae, 00}},
 	{AHLT, ynone, Px, opBytes{0xf4}},
 	{AIDIVB, ydivb, Pb, opBytes{0xf6, 07}},
 	{AIDIVL, ydivl, Px, opBytes{0xf7, 07}},
 	{AIDIVQ, ydivl, Pw, opBytes{0xf7, 07}},
 	{AIDIVW, ydivl, Pe, opBytes{0xf7, 07}},
 	{AIMULB, ydivb, Pb, opBytes{0xf6, 05}},
 	{AIMULL, yimul, Px, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
 	{AIMULQ, yimul, Pw, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
 	{AIMULW, yimul, Pe, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
 	{AIMUL3W, yimul3, Pe, opBytes{0x6b, 00, 0x69, 00}},
 	{AIMUL3L, yimul3, Px, opBytes{0x6b, 00, 0x69, 00}},
 	{AIMUL3Q, yimul3, Pw, opBytes{0x6b, 00, 0x69, 00}},
 	{AINB, yin, Pb, opBytes{0xe4, 0xec}},
 	{AINW, yin, Pe, opBytes{0xe5, 0xed}},
 	{AINL, yin, Px, opBytes{0xe5, 0xed}},
 	{AINCB, yscond, Pb, opBytes{0xfe, 00}},
 	{AINCL, yincl, Px1, opBytes{0x40, 0xff, 00}},
 	{AINCQ, yincq, Pw, opBytes{0xff, 00}},
 	{AINCW, yincq, Pe, opBytes{0xff, 00}},
 	{AINSB, ynone, Pb, opBytes{0x6c}},
 	{AINSL, ynone, Px, opBytes{0x6d}},
 	{AINSERTPS, yxshuf, Pq, opBytes{0x3a, 0x21, 0}},
 	{AINSW, ynone, Pe, opBytes{0x6d}},
 	{AICEBP, ynone, Px, opBytes{0xf1}},
 	{AINT, yint, Px, opBytes{0xcd}},
 	{AINTO, ynone, P32, opBytes{0xce}},
 	{AIRETL, ynone, Px, opBytes{0xcf}},
 	{AIRETQ, ynone, Pw, opBytes{0xcf}},
 	{AIRETW, ynone, Pe, opBytes{0xcf}},
 	{AJCC, yjcond, Px, opBytes{0x73, 0x83, 00}},
 	{AJCS, yjcond, Px, opBytes{0x72, 0x82}},
 	{AJCXZL, yloop, Px, opBytes{0xe3}},
 	{AJCXZW, yloop, Px, opBytes{0xe3}},
 	{AJCXZQ, yloop, Px, opBytes{0xe3}},
 	{AJEQ, yjcond, Px, opBytes{0x74, 0x84}},
 	{AJGE, yjcond, Px, opBytes{0x7d, 0x8d}},
 	{AJGT, yjcond, Px, opBytes{0x7f, 0x8f}},
 	{AJHI, yjcond, Px, opBytes{0x77, 0x87}},
 	{AJLE, yjcond, Px, opBytes{0x7e, 0x8e}},
 	{AJLS, yjcond, Px, opBytes{0x76, 0x86}},
 	{AJLT, yjcond, Px, opBytes{0x7c, 0x8c}},
 	{AJMI, yjcond, Px, opBytes{0x78, 0x88}},
 	{obj.AJMP, yjmp, Px, opBytes{0xff, 04, 0xeb, 0xe9}},
 	{AJNE, yjcond, Px, opBytes{0x75, 0x85}},
 	{AJOC, yjcond, Px, opBytes{0x71, 0x81, 00}},
 	{AJOS, yjcond, Px, opBytes{0x70, 0x80, 00}},
 	{AJPC, yjcond, Px, opBytes{0x7b, 0x8b}},
 	{AJPL, yjcond, Px, opBytes{0x79, 0x89}},
 	{AJPS, yjcond, Px, opBytes{0x7a, 0x8a}},
 	{AHADDPD, yxm, Pq, opBytes{0x7c}},
 	{AHADDPS, yxm, Pf2, opBytes{0x7c}},
 	{AHSUBPD, yxm, Pq, opBytes{0x7d}},
 	{AHSUBPS, yxm, Pf2, opBytes{0x7d}},
 	{ALAHF, ynone, Px, opBytes{0x9f}},
 	{ALARL, yml_rl, Pm, opBytes{0x02}},
 	{ALARQ, yml_rl, Pw, opBytes{0x0f, 0x02}},
 	{ALARW, yml_rl, Pq, opBytes{0x02}},
 	{ALDDQU, ylddqu, Pf2, opBytes{0xf0}},
 	{ALDMXCSR, ysvrs_mo, Pm, opBytes{0xae, 02, 0xae, 02}},
 	{ALEAL, ym_rl, Px, opBytes{0x8d}},
 	{ALEAQ, ym_rl, Pw, opBytes{0x8d}},
 	{ALEAVEL, ynone, P32, opBytes{0xc9}},
 	{ALEAVEQ, ynone, Py, opBytes{0xc9}},
 	{ALEAVEW, ynone, Pe, opBytes{0xc9}},
 	{ALEAW, ym_rl, Pe, opBytes{0x8d}},
 	{ALOCK, ynone, Px, opBytes{0xf0}},
 	{ALODSB, ynone, Pb, opBytes{0xac}},
 	{ALODSL, ynone, Px, opBytes{0xad}},
 	{ALODSQ, ynone, Pw, opBytes{0xad}},
 	{ALODSW, ynone, Pe, opBytes{0xad}},
 	{ALONG, ybyte, Px, opBytes{4}},
 	{ALOOP, yloop, Px, opBytes{0xe2}},
 	{ALOOPEQ, yloop, Px, opBytes{0xe1}},
 	{ALOOPNE, yloop, Px, opBytes{0xe0}},
 	{ALTR, ydivl, Pm, opBytes{0x00, 03}},
 	{ALZCNTL, yml_rl, Pf3, opBytes{0xbd}},
 	{ALZCNTQ, yml_rl, Pfw, opBytes{0xbd}},
 	{ALZCNTW, yml_rl, Pef3, opBytes{0xbd}},
 	{ALSLL, yml_rl, Pm, opBytes{0x03}},
 	{ALSLW, yml_rl, Pq, opBytes{0x03}},
 	{ALSLQ, yml_rl, Pw, opBytes{0x0f, 0x03}},
 	{AMASKMOVOU, yxr, Pe, opBytes{0xf7}},
 	{AMASKMOVQ, ymr, Pm, opBytes{0xf7}},
 	{AMAXPD, yxm, Pe, opBytes{0x5f}},
 	{AMAXPS, yxm, Pm, opBytes{0x5f}},
 	{AMAXSD, yxm, Pf2, opBytes{0x5f}},
 	{AMAXSS, yxm, Pf3, opBytes{0x5f}},
 	{AMINPD, yxm, Pe, opBytes{0x5d}},
 	{AMINPS, yxm, Pm, opBytes{0x5d}},
 	{AMINSD, yxm, Pf2, opBytes{0x5d}},
 	{AMINSS, yxm, Pf3, opBytes{0x5d}},
 	{AMONITOR, ynone, Px, opBytes{0x0f, 0x01, 0xc8, 0}},
 	{AMWAIT, ynone, Px, opBytes{0x0f, 0x01, 0xc9, 0}},
 	{AMOVAPD, yxmov, Pe, opBytes{0x28, 0x29}},
 	{AMOVAPS, yxmov, Pm, opBytes{0x28, 0x29}},
 	{AMOVB, ymovb, Pb, opBytes{0x88, 0x8a, 0xb0, 0xc6, 00}},
 	{AMOVBLSX, ymb_rl, Pm, opBytes{0xbe}},
 	{AMOVBLZX, ymb_rl, Pm, opBytes{0xb6}},
 	{AMOVBQSX, ymb_rl, Pw, opBytes{0x0f, 0xbe}},
 	{AMOVBQZX, ymb_rl, Pw, opBytes{0x0f, 0xb6}},
 	{AMOVBWSX, ymb_rl, Pq, opBytes{0xbe}},
 	{AMOVSWW, ymb_rl, Pe, opBytes{0x0f, 0xbf}},
 	{AMOVBWZX, ymb_rl, Pq, opBytes{0xb6}},
 	{AMOVZWW, ymb_rl, Pe, opBytes{0x0f, 0xb7}},
 	{AMOVO, yxmov, Pe, opBytes{0x6f, 0x7f}},
 	{AMOVOU, yxmov, Pf3, opBytes{0x6f, 0x7f}},
 	{AMOVHLPS, yxr, Pm, opBytes{0x12}},
 	{AMOVHPD, yxmov, Pe, opBytes{0x16, 0x17}},
 	{AMOVHPS, yxmov, Pm, opBytes{0x16, 0x17}},
 	{AMOVL, ymovl, Px, opBytes{0x89, 0x8b, 0xb8, 0xc7, 00, 0x6e, 0x7e, Pe, 0x6e, Pe, 0x7e, 0}},
 	{AMOVLHPS, yxr, Pm, opBytes{0x16}},
 	{AMOVLPD, yxmov, Pe, opBytes{0x12, 0x13}},
 	{AMOVLPS, yxmov, Pm, opBytes{0x12, 0x13}},
 	{AMOVLQSX, yml_rl, Pw, opBytes{0x63}},
 	{AMOVLQZX, yml_rl, Px, opBytes{0x8b}},
 	{AMOVMSKPD, yxrrl, Pq, opBytes{0x50}},
 	{AMOVMSKPS, yxrrl, Pm, opBytes{0x50}},
 	{AMOVNTO, yxr_ml, Pe, opBytes{0xe7}},
 	{AMOVNTDQA, ylddqu, Pq4, opBytes{0x2a}},
 	{AMOVNTPD, yxr_ml, Pe, opBytes{0x2b}},
 	{AMOVNTPS, yxr_ml, Pm, opBytes{0x2b}},
 	{AMOVNTQ, ymr_ml, Pm, opBytes{0xe7}},
 	{AMOVQ, ymovq, Pw8, opBytes{0x6f, 0x7f, Pf2, 0xd6, Pf3, 0x7e, Pe, 0xd6, 0x89, 0x8b, 0xc7, 00, 0xb8, 0xc7, 00, 0x6e, 0x7e, Pe, 0x6e, Pe, 0x7e, 0}},
 	{AMOVQOZX, ymrxr, Pf3, opBytes{0xd6, 0x7e}},
 	{AMOVSB, ynone, Pb, opBytes{0xa4}},
 	{AMOVSD, yxmov, Pf2, opBytes{0x10, 0x11}},
 	{AMOVSL, ynone, Px, opBytes{0xa5}},
 	{AMOVSQ, ynone, Pw, opBytes{0xa5}},
 	{AMOVSS, yxmov, Pf3, opBytes{0x10, 0x11}},
 	{AMOVSW, ynone, Pe, opBytes{0xa5}},
 	{AMOVUPD, yxmov, Pe, opBytes{0x10, 0x11}},
 	{AMOVUPS, yxmov, Pm, opBytes{0x10, 0x11}},
 	{AMOVW, ymovw, Pe, opBytes{0x89, 0x8b, 0xb8, 0xc7, 00, 0}},
 	{AMOVWLSX, yml_rl, Pm, opBytes{0xbf}},
 	{AMOVWLZX, yml_rl, Pm, opBytes{0xb7}},
 	{AMOVWQSX, yml_rl, Pw, opBytes{0x0f, 0xbf}},
 	{AMOVWQZX, yml_rl, Pw, opBytes{0x0f, 0xb7}},
 	{AMPSADBW, yxshuf, Pq, opBytes{0x3a, 0x42, 0}},
 	{AMULB, ydivb, Pb, opBytes{0xf6, 04}},
 	{AMULL, ydivl, Px, opBytes{0xf7, 04}},
 	{AMULPD, yxm, Pe, opBytes{0x59}},
 	{AMULPS, yxm, Ym, opBytes{0x59}},
 	{AMULQ, ydivl, Pw, opBytes{0xf7, 04}},
 	{AMULSD, yxm, Pf2, opBytes{0x59}},
 	{AMULSS, yxm, Pf3, opBytes{0x59}},
 	{AMULW, ydivl, Pe, opBytes{0xf7, 04}},
 	{ANEGB, yscond, Pb, opBytes{0xf6, 03}},
 	{ANEGL, yscond, Px, opBytes{0xf7, 03}},
 	{ANEGQ, yscond, Pw, opBytes{0xf7, 03}},
 	{ANEGW, yscond, Pe, opBytes{0xf7, 03}},
 	{obj.ANOP, ynop, Px, opBytes{0, 0}},
 	{ANOTB, yscond, Pb, opBytes{0xf6, 02}},
 	{ANOTL, yscond, Px, opBytes{0xf7, 02}}, // TODO(rsc): yscond is wrong here.
 	{ANOTQ, yscond, Pw, opBytes{0xf7, 02}},
 	{ANOTW, yscond, Pe, opBytes{0xf7, 02}},
 	{AORB, yxorb, Pb, opBytes{0x0c, 0x80, 01, 0x08, 0x0a}},
 	{AORL, yaddl, Px, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
 	{AORPD, yxm, Pq, opBytes{0x56}},
 	{AORPS, yxm, Pm, opBytes{0x56}},
 	{AORQ, yaddl, Pw, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
 	{AORW, yaddl, Pe, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
 	{AOUTB, yin, Pb, opBytes{0xe6, 0xee}},
 	{AOUTL, yin, Px, opBytes{0xe7, 0xef}},
 	{AOUTW, yin, Pe, opBytes{0xe7, 0xef}},
 	{AOUTSB, ynone, Pb, opBytes{0x6e}},
 	{AOUTSL, ynone, Px, opBytes{0x6f}},
 	{AOUTSW, ynone, Pe, opBytes{0x6f}},
 	{APABSB, yxm_q4, Pq4, opBytes{0x1c}},
 	{APABSD, yxm_q4, Pq4, opBytes{0x1e}},
 	{APABSW, yxm_q4, Pq4, opBytes{0x1d}},
 	{APACKSSLW, ymm, Py1, opBytes{0x6b, Pe, 0x6b}},
 	{APACKSSWB, ymm, Py1, opBytes{0x63, Pe, 0x63}},
 	{APACKUSDW, yxm_q4, Pq4, opBytes{0x2b}},
 	{APACKUSWB, ymm, Py1, opBytes{0x67, Pe, 0x67}},
 	{APADDB, ymm, Py1, opBytes{0xfc, Pe, 0xfc}},
 	{APADDL, ymm, Py1, opBytes{0xfe, Pe, 0xfe}},
 	{APADDQ, yxm, Pe, opBytes{0xd4}},
 	{APADDSB, ymm, Py1, opBytes{0xec, Pe, 0xec}},
 	{APADDSW, ymm, Py1, opBytes{0xed, Pe, 0xed}},
 	{APADDUSB, ymm, Py1, opBytes{0xdc, Pe, 0xdc}},
 	{APADDUSW, ymm, Py1, opBytes{0xdd, Pe, 0xdd}},
 	{APADDW, ymm, Py1, opBytes{0xfd, Pe, 0xfd}},
 	{APALIGNR, ypalignr, Pq, opBytes{0x3a, 0x0f}},
 	{APAND, ymm, Py1, opBytes{0xdb, Pe, 0xdb}},
 	{APANDN, ymm, Py1, opBytes{0xdf, Pe, 0xdf}},
 	{APAUSE, ynone, Px, opBytes{0xf3, 0x90}},
 	{APAVGB, ymm, Py1, opBytes{0xe0, Pe, 0xe0}},
 	{APAVGW, ymm, Py1, opBytes{0xe3, Pe, 0xe3}},
 	{APBLENDW, yxshuf, Pq, opBytes{0x3a, 0x0e, 0}},
 	{APCMPEQB, ymm, Py1, opBytes{0x74, Pe, 0x74}},
 	{APCMPEQL, ymm, Py1, opBytes{0x76, Pe, 0x76}},
 	{APCMPEQQ, yxm_q4, Pq4, opBytes{0x29}},
 	{APCMPEQW, ymm, Py1, opBytes{0x75, Pe, 0x75}},
 	{APCMPGTB, ymm, Py1, opBytes{0x64, Pe, 0x64}},
 	{APCMPGTL, ymm, Py1, opBytes{0x66, Pe, 0x66}},
 	{APCMPGTQ, yxm_q4, Pq4, opBytes{0x37}},
 	{APCMPGTW, ymm, Py1, opBytes{0x65, Pe, 0x65}},
 	{APCMPISTRI, yxshuf, Pq, opBytes{0x3a, 0x63, 0}},
 	{APCMPISTRM, yxshuf, Pq, opBytes{0x3a, 0x62, 0}},
 	{APEXTRW, yextrw, Pq, opBytes{0xc5, 0, 0x3a, 0x15, 0}},
 	{APEXTRB, yextr, Pq, opBytes{0x3a, 0x14, 00}},
 	{APEXTRD, yextr, Pq, opBytes{0x3a, 0x16, 00}},
 	{APEXTRQ, yextr, Pq3, opBytes{0x3a, 0x16, 00}},
 	{APHADDD, ymmxmm0f38, Px, opBytes{0x0F, 0x38, 0x02, 0, 0x66, 0x0F, 0x38, 0x02, 0}},
 	{APHADDSW, yxm_q4, Pq4, opBytes{0x03}},
 	{APHADDW, yxm_q4, Pq4, opBytes{0x01}},
 	{APHMINPOSUW, yxm_q4, Pq4, opBytes{0x41}},
 	{APHSUBD, yxm_q4, Pq4, opBytes{0x06}},
 	{APHSUBSW, yxm_q4, Pq4, opBytes{0x07}},
 	{APHSUBW, yxm_q4, Pq4, opBytes{0x05}},
 	{APINSRW, yinsrw, Pq, opBytes{0xc4, 00}},
 	{APINSRB, yinsr, Pq, opBytes{0x3a, 0x20, 00}},
 	{APINSRD, yinsr, Pq, opBytes{0x3a, 0x22, 00}},
 	{APINSRQ, yinsr, Pq3, opBytes{0x3a, 0x22, 00}},
 	{APMADDUBSW, yxm_q4, Pq4, opBytes{0x04}},
 	{APMADDWL, ymm, Py1, opBytes{0xf5, Pe, 0xf5}},
 	{APMAXSB, yxm_q4, Pq4, opBytes{0x3c}},
 	{APMAXSD, yxm_q4, Pq4, opBytes{0x3d}},
 	{APMAXSW, yxm, Pe, opBytes{0xee}},
 	{APMAXUB, yxm, Pe, opBytes{0xde}},
 	{APMAXUD, yxm_q4, Pq4, opBytes{0x3f}},
 	{APMAXUW, yxm_q4, Pq4, opBytes{0x3e}},
 	{APMINSB, yxm_q4, Pq4, opBytes{0x38}},
 	{APMINSD, yxm_q4, Pq4, opBytes{0x39}},
 	{APMINSW, yxm, Pe, opBytes{0xea}},
 	{APMINUB, yxm, Pe, opBytes{0xda}},
 	{APMINUD, yxm_q4, Pq4, opBytes{0x3b}},
 	{APMINUW, yxm_q4, Pq4, opBytes{0x3a}},
 	{APMOVMSKB, ymskb, Px, opBytes{Pe, 0xd7, 0xd7}},
 	{APMOVSXBD, yxm_q4, Pq4, opBytes{0x21}},
 	{APMOVSXBQ, yxm_q4, Pq4, opBytes{0x22}},
 	{APMOVSXBW, yxm_q4, Pq4, opBytes{0x20}},
 	{APMOVSXDQ, yxm_q4, Pq4, opBytes{0x25}},
 	{APMOVSXWD, yxm_q4, Pq4, opBytes{0x23}},
 	{APMOVSXWQ, yxm_q4, Pq4, opBytes{0x24}},
 	{APMOVZXBD, yxm_q4, Pq4, opBytes{0x31}},
 	{APMOVZXBQ, yxm_q4, Pq4, opBytes{0x32}},
 	{APMOVZXBW, yxm_q4, Pq4, opBytes{0x30}},
 	{APMOVZXDQ, yxm_q4, Pq4, opBytes{0x35}},
 	{APMOVZXWD, yxm_q4, Pq4, opBytes{0x33}},
 	{APMOVZXWQ, yxm_q4, Pq4, opBytes{0x34}},
 	{APMULDQ, yxm_q4, Pq4, opBytes{0x28}},
 	{APMULHRSW, yxm_q4, Pq4, opBytes{0x0b}},
 	{APMULHUW, ymm, Py1, opBytes{0xe4, Pe, 0xe4}},
 	{APMULHW, ymm, Py1, opBytes{0xe5, Pe, 0xe5}},
 	{APMULLD, yxm_q4, Pq4, opBytes{0x40}},
 	{APMULLW, ymm, Py1, opBytes{0xd5, Pe, 0xd5}},
 	{APMULULQ, ymm, Py1, opBytes{0xf4, Pe, 0xf4}},
 	{APOPAL, ynone, P32, opBytes{0x61}},
 	{APOPAW, ynone, Pe, opBytes{0x61}},
 	{APOPCNTW, yml_rl, Pef3, opBytes{0xb8}},
 	{APOPCNTL, yml_rl, Pf3, opBytes{0xb8}},
 	{APOPCNTQ, yml_rl, Pfw, opBytes{0xb8}},
 	{APOPFL, ynone, P32, opBytes{0x9d}},
 	{APOPFQ, ynone, Py, opBytes{0x9d}},
 	{APOPFW, ynone, Pe, opBytes{0x9d}},
 	{APOPL, ypopl, P32, opBytes{0x58, 0x8f, 00}},
 	{APOPQ, ypopl, Py, opBytes{0x58, 0x8f, 00}},
 	{APOPW, ypopl, Pe, opBytes{0x58, 0x8f, 00}},
 	{APOR, ymm, Py1, opBytes{0xeb, Pe, 0xeb}},
 	{APSADBW, yxm, Pq, opBytes{0xf6}},
 	{APSHUFHW, yxshuf, Pf3, opBytes{0x70, 00}},
 	{APSHUFL, yxshuf, Pq, opBytes{0x70, 00}},
 	{APSHUFLW, yxshuf, Pf2, opBytes{0x70, 00}},
 	{APSHUFW, ymshuf, Pm, opBytes{0x70, 00}},
 	{APSHUFB, ymshufb, Pq, opBytes{0x38, 0x00}},
 	{APSIGNB, yxm_q4, Pq4, opBytes{0x08}},
 	{APSIGND, yxm_q4, Pq4, opBytes{0x0a}},
 	{APSIGNW, yxm_q4, Pq4, opBytes{0x09}},
 	{APSLLO, ypsdq, Pq, opBytes{0x73, 07}},
 	{APSLLL, yps, Py3, opBytes{0xf2, 0x72, 06, Pe, 0xf2, Pe, 0x72, 06}},
 	{APSLLQ, yps, Py3, opBytes{0xf3, 0x73, 06, Pe, 0xf3, Pe, 0x73, 06}},
 	{APSLLW, yps, Py3, opBytes{0xf1, 0x71, 06, Pe, 0xf1, Pe, 0x71, 06}},
 	{APSRAL, yps, Py3, opBytes{0xe2, 0x72, 04, Pe, 0xe2, Pe, 0x72, 04}},
 	{APSRAW, yps, Py3, opBytes{0xe1, 0x71, 04, Pe, 0xe1, Pe, 0x71, 04}},
 	{APSRLO, ypsdq, Pq, opBytes{0x73, 03}},
 	{APSRLL, yps, Py3, opBytes{0xd2, 0x72, 02, Pe, 0xd2, Pe, 0x72, 02}},
 	{APSRLQ, yps, Py3, opBytes{0xd3, 0x73, 02, Pe, 0xd3, Pe, 0x73, 02}},
 	{APSRLW, yps, Py3, opBytes{0xd1, 0x71, 02, Pe, 0xd1, Pe, 0x71, 02}},
 	{APSUBB, yxm, Pe, opBytes{0xf8}},
 	{APSUBL, yxm, Pe, opBytes{0xfa}},
 	{APSUBQ, yxm, Pe, opBytes{0xfb}},
 	{APSUBSB, yxm, Pe, opBytes{0xe8}},
 	{APSUBSW, yxm, Pe, opBytes{0xe9}},
 	{APSUBUSB, yxm, Pe, opBytes{0xd8}},
 	{APSUBUSW, yxm, Pe, opBytes{0xd9}},
 	{APSUBW, yxm, Pe, opBytes{0xf9}},
 	{APTEST, yxm_q4, Pq4, opBytes{0x17}},
 	{APUNPCKHBW, ymm, Py1, opBytes{0x68, Pe, 0x68}},
 	{APUNPCKHLQ, ymm, Py1, opBytes{0x6a, Pe, 0x6a}},
 	{APUNPCKHQDQ, yxm, Pe, opBytes{0x6d}},
 	{APUNPCKHWL, ymm, Py1, opBytes{0x69, Pe, 0x69}},
 	{APUNPCKLBW, ymm, Py1, opBytes{0x60, Pe, 0x60}},
 	{APUNPCKLLQ, ymm, Py1, opBytes{0x62, Pe, 0x62}},
 	{APUNPCKLQDQ, yxm, Pe, opBytes{0x6c}},
 	{APUNPCKLWL, ymm, Py1, opBytes{0x61, Pe, 0x61}},
 	{APUSHAL, ynone, P32, opBytes{0x60}},
 	{APUSHAW, ynone, Pe, opBytes{0x60}},
 	{APUSHFL, ynone, P32, opBytes{0x9c}},
 	{APUSHFQ, ynone, Py, opBytes{0x9c}},
 	{APUSHFW, ynone, Pe, opBytes{0x9c}},
 	{APUSHL, ypushl, P32, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
 	{APUSHQ, ypushl, Py, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
 	{APUSHW, ypushl, Pe, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
 	{APXOR, ymm, Py1, opBytes{0xef, Pe, 0xef}},
 	{AQUAD, ybyte, Px, opBytes{8}},
 	{ARCLB, yshb, Pb, opBytes{0xd0, 02, 0xc0, 02, 0xd2, 02}},
 	{ARCLL, yshl, Px, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
 	{ARCLQ, yshl, Pw, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
 	{ARCLW, yshl, Pe, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
 	{ARCPPS, yxm, Pm, opBytes{0x53}},
 	{ARCPSS, yxm, Pf3, opBytes{0x53}},
 	{ARCRB, yshb, Pb, opBytes{0xd0, 03, 0xc0, 03, 0xd2, 03}},
 	{ARCRL, yshl, Px, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
 	{ARCRQ, yshl, Pw, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
 	{ARCRW, yshl, Pe, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
 	{AREP, ynone, Px, opBytes{0xf3}},
 	{AREPN, ynone, Px, opBytes{0xf2}},
 	{obj.ARET, ynone, Px, opBytes{0xc3}},
 	{ARETFW, yret, Pe, opBytes{0xcb, 0xca}},
 	{ARETFL, yret, Px, opBytes{0xcb, 0xca}},
 	{ARETFQ, yret, Pw, opBytes{0xcb, 0xca}},
 	{AROLB, yshb, Pb, opBytes{0xd0, 00, 0xc0, 00, 0xd2, 00}},
 	{AROLL, yshl, Px, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
 	{AROLQ, yshl, Pw, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
 	{AROLW, yshl, Pe, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
 	{ARORB, yshb, Pb, opBytes{0xd0, 01, 0xc0, 01, 0xd2, 01}},
 	{ARORL, yshl, Px, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
 	{ARORQ, yshl, Pw, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
 	{ARORW, yshl, Pe, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
 	{ARSQRTPS, yxm, Pm, opBytes{0x52}},
 	{ARSQRTSS, yxm, Pf3, opBytes{0x52}},
 	{ASAHF, ynone, Px, opBytes{0x9e, 00, 0x86, 0xe0, 0x50, 0x9d}}, // XCHGB AH,AL; PUSH AX; POPFL
 	{ASALB, yshb, Pb, opBytes{0xd0, 04, 0xc0, 04, 0xd2, 04}},
 	{ASALL, yshl, Px, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
 	{ASALQ, yshl, Pw, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
 	{ASALW, yshl, Pe, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
 	{ASARB, yshb, Pb, opBytes{0xd0, 07, 0xc0, 07, 0xd2, 07}},
 	{ASARL, yshl, Px, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
 	{ASARQ, yshl, Pw, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
 	{ASARW, yshl, Pe, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
 	{ASBBB, yxorb, Pb, opBytes{0x1c, 0x80, 03, 0x18, 0x1a}},
 	{ASBBL, yaddl, Px, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
 	{ASBBQ, yaddl, Pw, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
 	{ASBBW, yaddl, Pe, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
 	{ASCASB, ynone, Pb, opBytes{0xae}},
 	{ASCASL, ynone, Px, opBytes{0xaf}},
 	{ASCASQ, ynone, Pw, opBytes{0xaf}},
 	{ASCASW, ynone, Pe, opBytes{0xaf}},
 	{ASETCC, yscond, Pb, opBytes{0x0f, 0x93, 00}},
 	{ASETCS, yscond, Pb, opBytes{0x0f, 0x92, 00}},
 	{ASETEQ, yscond, Pb, opBytes{0x0f, 0x94, 00}},
 	{ASETGE, yscond, Pb, opBytes{0x0f, 0x9d, 00}},
 	{ASETGT, yscond, Pb, opBytes{0x0f, 0x9f, 00}},
 	{ASETHI, yscond, Pb, opBytes{0x0f, 0x97, 00}},
 	{ASETLE, yscond, Pb, opBytes{0x0f, 0x9e, 00}},
 	{ASETLS, yscond, Pb, opBytes{0x0f, 0x96, 00}},
 	{ASETLT, yscond, Pb, opBytes{0x0f, 0x9c, 00}},
 	{ASETMI, yscond, Pb, opBytes{0x0f, 0x98, 00}},
 	{ASETNE, yscond, Pb, opBytes{0x0f, 0x95, 00}},
 	{ASETOC, yscond, Pb, opBytes{0x0f, 0x91, 00}},
 	{ASETOS, yscond, Pb, opBytes{0x0f, 0x90, 00}},
 	{ASETPC, yscond, Pb, opBytes{0x0f, 0x9b, 00}},
 	{ASETPL, yscond, Pb, opBytes{0x0f, 0x99, 00}},
 	{ASETPS, yscond, Pb, opBytes{0x0f, 0x9a, 00}},
 	{ASHLB, yshb, Pb, opBytes{0xd0, 04, 0xc0, 04, 0xd2, 04}},
 	{ASHLL, yshl, Px, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
 	{ASHLQ, yshl, Pw, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
 	{ASHLW, yshl, Pe, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
 	{ASHRB, yshb, Pb, opBytes{0xd0, 05, 0xc0, 05, 0xd2, 05}},
 	{ASHRL, yshl, Px, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
 	{ASHRQ, yshl, Pw, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
 	{ASHRW, yshl, Pe, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
 	{ASHUFPD, yxshuf, Pq, opBytes{0xc6, 00}},
 	{ASHUFPS, yxshuf, Pm, opBytes{0xc6, 00}},
 	{ASQRTPD, yxm, Pe, opBytes{0x51}},
 	{ASQRTPS, yxm, Pm, opBytes{0x51}},
 	{ASQRTSD, yxm, Pf2, opBytes{0x51}},
 	{ASQRTSS, yxm, Pf3, opBytes{0x51}},
 	{ASTC, ynone, Px, opBytes{0xf9}},
 	{ASTD, ynone, Px, opBytes{0xfd}},
 	{ASTI, ynone, Px, opBytes{0xfb}},
 	{ASTMXCSR, ysvrs_om, Pm, opBytes{0xae, 03, 0xae, 03}},
 	{ASTOSB, ynone, Pb, opBytes{0xaa}},
 	{ASTOSL, ynone, Px, opBytes{0xab}},
 	{ASTOSQ, ynone, Pw, opBytes{0xab}},
 	{ASTOSW, ynone, Pe, opBytes{0xab}},
 	{ASUBB, yxorb, Pb, opBytes{0x2c, 0x80, 05, 0x28, 0x2a}},
 	{ASUBL, yaddl, Px, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
 	{ASUBPD, yxm, Pe, opBytes{0x5c}},
 	{ASUBPS, yxm, Pm, opBytes{0x5c}},
 	{ASUBQ, yaddl, Pw, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
 	{ASUBSD, yxm, Pf2, opBytes{0x5c}},
 	{ASUBSS, yxm, Pf3, opBytes{0x5c}},
 	{ASUBW, yaddl, Pe, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
 	{ASWAPGS, ynone, Pm, opBytes{0x01, 0xf8}},
 	{ASYSCALL, ynone, Px, opBytes{0x0f, 0x05}}, // fast syscall
 	{ATESTB, yxorb, Pb, opBytes{0xa8, 0xf6, 00, 0x84, 0x84}},
 	{ATESTL, ytestl, Px, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
 	{ATESTQ, ytestl, Pw, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
 	{ATESTW, ytestl, Pe, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
 	{ATPAUSE, ywrfsbase, Pq, opBytes{0xae, 06}},
 	{obj.ATEXT, ytext, Px, opBytes{}},
 	{AUCOMISD, yxm, Pe, opBytes{0x2e}},
 	{AUCOMISS, yxm, Pm, opBytes{0x2e}},
 	{AUNPCKHPD, yxm, Pe, opBytes{0x15}},
 	{AUNPCKHPS, yxm, Pm, opBytes{0x15}},
 	{AUNPCKLPD, yxm, Pe, opBytes{0x14}},
 	{AUNPCKLPS, yxm, Pm, opBytes{0x14}},
 	{AUMONITOR, ywrfsbase, Pf3, opBytes{0xae, 06}},
 	{AVERR, ydivl, Pm, opBytes{0x00, 04}},
 	{AVERW, ydivl, Pm, opBytes{0x00, 05}},
 	{AWAIT, ynone, Px, opBytes{0x9b}},
 	{AWORD, ybyte, Px, opBytes{2}},
 	{AXCHGB, yml_mb, Pb, opBytes{0x86, 0x86}},
 	{AXCHGL, yxchg, Px, opBytes{0x90, 0x90, 0x87, 0x87}},
 	{AXCHGQ, yxchg, Pw, opBytes{0x90, 0x90, 0x87, 0x87}},
 	{AXCHGW, yxchg, Pe, opBytes{0x90, 0x90, 0x87, 0x87}},
 	{AXLAT, ynone, Px, opBytes{0xd7}},
 	{AXORB, yxorb, Pb, opBytes{0x34, 0x80, 06, 0x30, 0x32}},
 	{AXORL, yaddl, Px, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
 	{AXORPD, yxm, Pe, opBytes{0x57}},
 	{AXORPS, yxm, Pm, opBytes{0x57}},
 	{AXORQ, yaddl, Pw, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
 	{AXORW, yaddl, Pe, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
 	{AFMOVB, yfmvx, Px, opBytes{0xdf, 04}},
 	{AFMOVBP, yfmvp, Px, opBytes{0xdf, 06}},
 	{AFMOVD, yfmvd, Px, opBytes{0xdd, 00, 0xdd, 02, 0xd9, 00, 0xdd, 02}},
 	{AFMOVDP, yfmvdp, Px, opBytes{0xdd, 03, 0xdd, 03}},
 	{AFMOVF, yfmvf, Px, opBytes{0xd9, 00, 0xd9, 02}},
 	{AFMOVFP, yfmvp, Px, opBytes{0xd9, 03}},
 	{AFMOVL, yfmvf, Px, opBytes{0xdb, 00, 0xdb, 02}},
 	{AFMOVLP, yfmvp, Px, opBytes{0xdb, 03}},
 	{AFMOVV, yfmvx, Px, opBytes{0xdf, 05}},
 	{AFMOVVP, yfmvp, Px, opBytes{0xdf, 07}},
 	{AFMOVW, yfmvf, Px, opBytes{0xdf, 00, 0xdf, 02}},
 	{AFMOVWP, yfmvp, Px, opBytes{0xdf, 03}},
 	{AFMOVX, yfmvx, Px, opBytes{0xdb, 05}},
 	{AFMOVXP, yfmvp, Px, opBytes{0xdb, 07}},
 	{AFCMOVCC, yfcmv, Px, opBytes{0xdb, 00}},
 	{AFCMOVCS, yfcmv, Px, opBytes{0xda, 00}},
 	{AFCMOVEQ, yfcmv, Px, opBytes{0xda, 01}},
 	{AFCMOVHI, yfcmv, Px, opBytes{0xdb, 02}},
 	{AFCMOVLS, yfcmv, Px, opBytes{0xda, 02}},
 	{AFCMOVB, yfcmv, Px, opBytes{0xda, 00}},
 	{AFCMOVBE, yfcmv, Px, opBytes{0xda, 02}},
 	{AFCMOVNB, yfcmv, Px, opBytes{0xdb, 00}},
 	{AFCMOVNBE, yfcmv, Px, opBytes{0xdb, 02}},
 	{AFCMOVE, yfcmv, Px, opBytes{0xda, 01}},
 	{AFCMOVNE, yfcmv, Px, opBytes{0xdb, 01}},
 	{AFCMOVNU, yfcmv, Px, opBytes{0xdb, 03}},
 	{AFCMOVU, yfcmv, Px, opBytes{0xda, 03}},
 	{AFCMOVUN, yfcmv, Px, opBytes{0xda, 03}},
 	{AFCOMD, yfadd, Px, opBytes{0xdc, 02, 0xd8, 02, 0xdc, 02}},  // botch
 	{AFCOMDP, yfadd, Px, opBytes{0xdc, 03, 0xd8, 03, 0xdc, 03}}, // botch
 	{AFCOMDPP, ycompp, Px, opBytes{0xde, 03}},
 	{AFCOMF, yfmvx, Px, opBytes{0xd8, 02}},
 	{AFCOMFP, yfmvx, Px, opBytes{0xd8, 03}},
 	{AFCOMI, yfcmv, Px, opBytes{0xdb, 06}},
 	{AFCOMIP, yfcmv, Px, opBytes{0xdf, 06}},
 	{AFCOML, yfmvx, Px, opBytes{0xda, 02}},
 	{AFCOMLP, yfmvx, Px, opBytes{0xda, 03}},
 	{AFCOMW, yfmvx, Px, opBytes{0xde, 02}},
 	{AFCOMWP, yfmvx, Px, opBytes{0xde, 03}},
 	{AFUCOM, ycompp, Px, opBytes{0xdd, 04}},
 	{AFUCOMI, ycompp, Px, opBytes{0xdb, 05}},
 	{AFUCOMIP, ycompp, Px, opBytes{0xdf, 05}},
 	{AFUCOMP, ycompp, Px, opBytes{0xdd, 05}},
 	{AFUCOMPP, ycompp, Px, opBytes{0xda, 13}},
 	{AFADDDP, ycompp, Px, opBytes{0xde, 00}},
 	{AFADDW, yfmvx, Px, opBytes{0xde, 00}},
 	{AFADDL, yfmvx, Px, opBytes{0xda, 00}},
 	{AFADDF, yfmvx, Px, opBytes{0xd8, 00}},
 	{AFADDD, yfadd, Px, opBytes{0xdc, 00, 0xd8, 00, 0xdc, 00}},
 	{AFMULDP, ycompp, Px, opBytes{0xde, 01}},
 	{AFMULW, yfmvx, Px, opBytes{0xde, 01}},
 	{AFMULL, yfmvx, Px, opBytes{0xda, 01}},
 	{AFMULF, yfmvx, Px, opBytes{0xd8, 01}},
 	{AFMULD, yfadd, Px, opBytes{0xdc, 01, 0xd8, 01, 0xdc, 01}},
 	{AFSUBDP, ycompp, Px, opBytes{0xde, 05}},
 	{AFSUBW, yfmvx, Px, opBytes{0xde, 04}},
 	{AFSUBL, yfmvx, Px, opBytes{0xda, 04}},
 	{AFSUBF, yfmvx, Px, opBytes{0xd8, 04}},
 	{AFSUBD, yfadd, Px, opBytes{0xdc, 04, 0xd8, 04, 0xdc, 05}},
 	{AFSUBRDP, ycompp, Px, opBytes{0xde, 04}},
 	{AFSUBRW, yfmvx, Px, opBytes{0xde, 05}},
 	{AFSUBRL, yfmvx, Px, opBytes{0xda, 05}},
 	{AFSUBRF, yfmvx, Px, opBytes{0xd8, 05}},
 	{AFSUBRD, yfadd, Px, opBytes{0xdc, 05, 0xd8, 05, 0xdc, 04}},
 	{AFDIVDP, ycompp, Px, opBytes{0xde, 07}},
 	{AFDIVW, yfmvx, Px, opBytes{0xde, 06}},
 	{AFDIVL, yfmvx, Px, opBytes{0xda, 06}},
 	{AFDIVF, yfmvx, Px, opBytes{0xd8, 06}},
 	{AFDIVD, yfadd, Px, opBytes{0xdc, 06, 0xd8, 06, 0xdc, 07}},
 	{AFDIVRDP, ycompp, Px, opBytes{0xde, 06}},
 	{AFDIVRW, yfmvx, Px, opBytes{0xde, 07}},
 	{AFDIVRL, yfmvx, Px, opBytes{0xda, 07}},
 	{AFDIVRF, yfmvx, Px, opBytes{0xd8, 07}},
 	{AFDIVRD, yfadd, Px, opBytes{0xdc, 07, 0xd8, 07, 0xdc, 06}},
 	{AFXCHD, yfxch, Px, opBytes{0xd9, 01, 0xd9, 01}},
 	{AFFREE, nil, 0, opBytes{}},
 	{AFLDCW, ysvrs_mo, Px, opBytes{0xd9, 05, 0xd9, 05}},
 	{AFLDENV, ysvrs_mo, Px, opBytes{0xd9, 04, 0xd9, 04}},
 	{AFRSTOR, ysvrs_mo, Px, opBytes{0xdd, 04, 0xdd, 04}},
 	{AFSAVE, ysvrs_om, Px, opBytes{0xdd, 06, 0xdd, 06}},
 	{AFSTCW, ysvrs_om, Px, opBytes{0xd9, 07, 0xd9, 07}},
 	{AFSTENV, ysvrs_om, Px, opBytes{0xd9, 06, 0xd9, 06}},
 	{AFSTSW, ystsw, Px, opBytes{0xdd, 07, 0xdf, 0xe0}},
 	{AF2XM1, ynone, Px, opBytes{0xd9, 0xf0}},
 	{AFABS, ynone, Px, opBytes{0xd9, 0xe1}},
 	{AFBLD, ysvrs_mo, Px, opBytes{0xdf, 04}},
 	{AFBSTP, yclflush, Px, opBytes{0xdf, 06}},
 	{AFCHS, ynone, Px, opBytes{0xd9, 0xe0}},
 	{AFCLEX, ynone, Px, opBytes{0xdb, 0xe2}},
 	{AFCOS, ynone, Px, opBytes{0xd9, 0xff}},
 	{AFDECSTP, ynone, Px, opBytes{0xd9, 0xf6}},
 	{AFINCSTP, ynone, Px, opBytes{0xd9, 0xf7}},
 	{AFINIT, ynone, Px, opBytes{0xdb, 0xe3}},
 	{AFLD1, ynone, Px, opBytes{0xd9, 0xe8}},
 	{AFLDL2E, ynone, Px, opBytes{0xd9, 0xea}},
 	{AFLDL2T, ynone, Px, opBytes{0xd9, 0xe9}},
 	{AFLDLG2, ynone, Px, opBytes{0xd9, 0xec}},
 	{AFLDLN2, ynone, Px, opBytes{0xd9, 0xed}},
 	{AFLDPI, ynone, Px, opBytes{0xd9, 0xeb}},
 	{AFLDZ, ynone, Px, opBytes{0xd9, 0xee}},
 	{AFNOP, ynone, Px, opBytes{0xd9, 0xd0}},
 	{AFPATAN, ynone, Px, opBytes{0xd9, 0xf3}},
 	{AFPREM, ynone, Px, opBytes{0xd9, 0xf8}},
 	{AFPREM1, ynone, Px, opBytes{0xd9, 0xf5}},
 	{AFPTAN, ynone, Px, opBytes{0xd9, 0xf2}},
 	{AFRNDINT, ynone, Px, opBytes{0xd9, 0xfc}},
 	{AFSCALE, ynone, Px, opBytes{0xd9, 0xfd}},
 	{AFSIN, ynone, Px, opBytes{0xd9, 0xfe}},
 	{AFSINCOS, ynone, Px, opBytes{0xd9, 0xfb}},
 	{AFSQRT, ynone, Px, opBytes{0xd9, 0xfa}},
 	{AFTST, ynone, Px, opBytes{0xd9, 0xe4}},
 	{AFXAM, ynone, Px, opBytes{0xd9, 0xe5}},
 	{AFXTRACT, ynone, Px, opBytes{0xd9, 0xf4}},
 	{AFYL2X, ynone, Px, opBytes{0xd9, 0xf1}},
 	{AFYL2XP1, ynone, Px, opBytes{0xd9, 0xf9}},
 	{ACMPXCHGB, yrb_mb, Pb, opBytes{0x0f, 0xb0}},
 	{ACMPXCHGL, yrl_ml, Px, opBytes{0x0f, 0xb1}},
 	{ACMPXCHGW, yrl_ml, Pe, opBytes{0x0f, 0xb1}},
 	{ACMPXCHGQ, yrl_ml, Pw, opBytes{0x0f, 0xb1}},
 	{ACMPXCHG8B, yscond, Pm, opBytes{0xc7, 01}},
 	{ACMPXCHG16B, yscond, Pw, opBytes{0x0f, 0xc7, 01}},
 	{AINVD, ynone, Pm, opBytes{0x08}},
 	{AINVLPG, ydivb, Pm, opBytes{0x01, 07}},
 	{AINVPCID, ycrc32l, Pe, opBytes{0x0f, 0x38, 0x82, 0}},
 	{ALFENCE, ynone, Pm, opBytes{0xae, 0xe8}},
 	{AMFENCE, ynone, Pm, opBytes{0xae, 0xf0}},
 	{AMOVNTIL, yrl_ml, Pm, opBytes{0xc3}},
 	{AMOVNTIQ, yrl_ml, Pw, opBytes{0x0f, 0xc3}},
 	{ARDPKRU, ynone, Pm, opBytes{0x01, 0xee, 0}},
 	{ARDMSR, ynone, Pm, opBytes{0x32}},
 	{ARDPMC, ynone, Pm, opBytes{0x33}},
 	{ARDTSC, ynone, Pm, opBytes{0x31}},
 	{ARSM, ynone, Pm, opBytes{0xaa}},
 	{ASFENCE, ynone, Pm, opBytes{0xae, 0xf8}},
 	{ASYSRET, ynone, Pm, opBytes{0x07}},
 	{AWBINVD, ynone, Pm, opBytes{0x09}},
 	{AWRMSR, ynone, Pm, opBytes{0x30}},
 	{AWRPKRU, ynone, Pm, opBytes{0x01, 0xef, 0}},
 	{AXADDB, yrb_mb, Pb, opBytes{0x0f, 0xc0}},
 	{AXADDL, yrl_ml, Px, opBytes{0x0f, 0xc1}},
 	{AXADDQ, yrl_ml, Pw, opBytes{0x0f, 0xc1}},
 	{AXADDW, yrl_ml, Pe, opBytes{0x0f, 0xc1}},
 	{ACRC32B, ycrc32b, Px, opBytes{0xf2, 0x0f, 0x38, 0xf0, 0}},
 	{ACRC32L, ycrc32l, Px, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
 	{ACRC32Q, ycrc32l, Pw, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
 	{ACRC32W, ycrc32l, Pe, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
 	{APREFETCHT0, yprefetch, Pm, opBytes{0x18, 01}},
 	{APREFETCHT1, yprefetch, Pm, opBytes{0x18, 02}},
 	{APREFETCHT2, yprefetch, Pm, opBytes{0x18, 03}},
 	{APREFETCHNTA, yprefetch, Pm, opBytes{0x18, 00}},
 	{AMOVQL, yrl_ml, Px, opBytes{0x89}},
 	{obj.AUNDEF, ynone, Px, opBytes{0x0f, 0x0b}},
 	{AAESENC, yaes, Pq, opBytes{0x38, 0xdc, 0}},
 	{AAESENCLAST, yaes, Pq, opBytes{0x38, 0xdd, 0}},
 	{AAESDEC, yaes, Pq, opBytes{0x38, 0xde, 0}},
 	{AAESDECLAST, yaes, Pq, opBytes{0x38, 0xdf, 0}},
 	{AAESIMC, yaes, Pq, opBytes{0x38, 0xdb, 0}},
 	{AAESKEYGENASSIST, yxshuf, Pq, opBytes{0x3a, 0xdf, 0}},
 	{AROUNDPD, yxshuf, Pq, opBytes{0x3a, 0x09, 0}},
 	{AROUNDPS, yxshuf, Pq, opBytes{0x3a, 0x08, 0}},
 	{AROUNDSD, yxshuf, Pq, opBytes{0x3a, 0x0b, 0}},
 	{AROUNDSS, yxshuf, Pq, opBytes{0x3a, 0x0a, 0}},
 	{APSHUFD, yxshuf, Pq, opBytes{0x70, 0}},
 	{APCLMULQDQ, yxshuf, Pq, opBytes{0x3a, 0x44, 0}},
 	{APCMPESTRI, yxshuf, Pq, opBytes{0x3a, 0x61, 0}},
 	{APCMPESTRM, yxshuf, Pq, opBytes{0x3a, 0x60, 0}},
 	{AMOVDDUP, yxm, Pf2, opBytes{0x12}},
 	{AMOVSHDUP, yxm, Pf3, opBytes{0x16}},
 	{AMOVSLDUP, yxm, Pf3, opBytes{0x12}},
 	{ARDTSCP, ynone, Pm, opBytes{0x01, 0xf9, 0}},
 	{ASTAC, ynone, Pm, opBytes{0x01, 0xcb, 0}},
 	{AUD1, ynone, Pm, opBytes{0xb9, 0}},
 	{AUD2, ynone, Pm, opBytes{0x0b, 0}},
 	{AUMWAIT, ywrfsbase, Pf2, opBytes{0xae, 06}},
 	{ASYSENTER, ynone, Px, opBytes{0x0f, 0x34, 0}},
 	{ASYSENTER64, ynone, Pw, opBytes{0x0f, 0x34, 0}},
 	{ASYSEXIT, ynone, Px, opBytes{0x0f, 0x35, 0}},
 	{ASYSEXIT64, ynone, Pw, opBytes{0x0f, 0x35, 0}},
 	{ALMSW, ydivl, Pm, opBytes{0x01, 06}},
 	{ALLDT, ydivl, Pm, opBytes{0x00, 02}},
 	{ALIDT, ysvrs_mo, Pm, opBytes{0x01, 03}},
 	{ALGDT, ysvrs_mo, Pm, opBytes{0x01, 02}},
 	{ATZCNTW, ycrc32l, Pe, opBytes{0xf3, 0x0f, 0xbc, 0}},
 	{ATZCNTL, ycrc32l, Px, opBytes{0xf3, 0x0f, 0xbc, 0}},
 	{ATZCNTQ, ycrc32l, Pw, opBytes{0xf3, 0x0f, 0xbc, 0}},
 	{AXRSTOR, ydivl, Px, opBytes{0x0f, 0xae, 05}},
 	{AXRSTOR64, ydivl, Pw, opBytes{0x0f, 0xae, 05}},
 	{AXRSTORS, ydivl, Px, opBytes{0x0f, 0xc7, 03}},
 	{AXRSTORS64, ydivl, Pw, opBytes{0x0f, 0xc7, 03}},
 	{AXSAVE, yclflush, Px, opBytes{0x0f, 0xae, 04}},
 	{AXSAVE64, yclflush, Pw, opBytes{0x0f, 0xae, 04}},
 	{AXSAVEOPT, yclflush, Px, opBytes{0x0f, 0xae, 06}},
 	{AXSAVEOPT64, yclflush, Pw, opBytes{0x0f, 0xae, 06}},
 	{AXSAVEC, yclflush, Px, opBytes{0x0f, 0xc7, 04}},
 	{AXSAVEC64, yclflush, Pw, opBytes{0x0f, 0xc7, 04}},
 	{AXSAVES, yclflush, Px, opBytes{0x0f, 0xc7, 05}},
 	{AXSAVES64, yclflush, Pw, opBytes{0x0f, 0xc7, 05}},
 	{ASGDT, yclflush, Pm, opBytes{0x01, 00}},
 	{ASIDT, yclflush, Pm, opBytes{0x01, 01}},
 	{ARDRANDW, yrdrand, Pe, opBytes{0x0f, 0xc7, 06}},
 	{ARDRANDL, yrdrand, Px, opBytes{0x0f, 0xc7, 06}},
 	{ARDRANDQ, yrdrand, Pw, opBytes{0x0f, 0xc7, 06}},
 	{ARDSEEDW, yrdrand, Pe, opBytes{0x0f, 0xc7, 07}},
 	{ARDSEEDL, yrdrand, Px, opBytes{0x0f, 0xc7, 07}},
 	{ARDSEEDQ, yrdrand, Pw, opBytes{0x0f, 0xc7, 07}},
 	{ASTRW, yincq, Pe, opBytes{0x0f, 0x00, 01}},
 	{ASTRL, yincq, Px, opBytes{0x0f, 0x00, 01}},
 	{ASTRQ, yincq, Pw, opBytes{0x0f, 0x00, 01}},
 	{AXSETBV, ynone, Pm, opBytes{0x01, 0xd1, 0}},
 	{AMOVBEW, ymovbe, Pq, opBytes{0x38, 0xf0, 0, 0x38, 0xf1, 0}},
 	{AMOVBEL, ymovbe, Pm, opBytes{0x38, 0xf0, 0, 0x38, 0xf1, 0}},
 	{AMOVBEQ, ymovbe, Pw, opBytes{0x0f, 0x38, 0xf0, 0, 0x0f, 0x38, 0xf1, 0}},
 	{ANOPW, ydivl, Pe, opBytes{0x0f, 0x1f, 00}},
 	{ANOPL, ydivl, Px, opBytes{0x0f, 0x1f, 00}},
 	{ASLDTW, yincq, Pe, opBytes{0x0f, 0x00, 00}},
 	{ASLDTL, yincq, Px, opBytes{0x0f, 0x00, 00}},
 	{ASLDTQ, yincq, Pw, opBytes{0x0f, 0x00, 00}},
 	{ASMSWW, yincq, Pe, opBytes{0x0f, 0x01, 04}},
 	{ASMSWL, yincq, Px, opBytes{0x0f, 0x01, 04}},
 	{ASMSWQ, yincq, Pw, opBytes{0x0f, 0x01, 04}},
 	{ABLENDVPS, yblendvpd, Pq4, opBytes{0x14}},
 	{ABLENDVPD, yblendvpd, Pq4, opBytes{0x15}},
 	{APBLENDVB, yblendvpd, Pq4, opBytes{0x10}},
 	{ASHA1MSG1, yaes, Px, opBytes{0x0f, 0x38, 0xc9, 0}},
 	{ASHA1MSG2, yaes, Px, opBytes{0x0f, 0x38, 0xca, 0}},
 	{ASHA1NEXTE, yaes, Px, opBytes{0x0f, 0x38, 0xc8, 0}},
 	{ASHA256MSG1, yaes, Px, opBytes{0x0f, 0x38, 0xcc, 0}},
 	{ASHA256MSG2, yaes, Px, opBytes{0x0f, 0x38, 0xcd, 0}},
 	{ASHA1RNDS4, ysha1rnds4, Pm, opBytes{0x3a, 0xcc, 0}},
 	{ASHA256RNDS2, ysha256rnds2, Px, opBytes{0x0f, 0x38, 0xcb, 0}},
 	{ARDFSBASEL, yrdrand, Pf3, opBytes{0xae, 00}},
 	{ARDFSBASEQ, yrdrand, Pfw, opBytes{0xae, 00}},
 	{ARDGSBASEL, yrdrand, Pf3, opBytes{0xae, 01}},
 	{ARDGSBASEQ, yrdrand, Pfw, opBytes{0xae, 01}},
 	{AWRFSBASEL, ywrfsbase, Pf3, opBytes{0xae, 02}},
 	{AWRFSBASEQ, ywrfsbase, Pfw, opBytes{0xae, 02}},
 	{AWRGSBASEL, ywrfsbase, Pf3, opBytes{0xae, 03}},
 	{AWRGSBASEQ, ywrfsbase, Pfw, opBytes{0xae, 03}},
 	{ALFSW, ym_rl, Pe, opBytes{0x0f, 0xb4}},
 	{ALFSL, ym_rl, Px, opBytes{0x0f, 0xb4}},
 	{ALFSQ, ym_rl, Pw, opBytes{0x0f, 0xb4}},
 	{ALGSW, ym_rl, Pe, opBytes{0x0f, 0xb5}},
 	{ALGSL, ym_rl, Px, opBytes{0x0f, 0xb5}},
 	{ALGSQ, ym_rl, Pw, opBytes{0x0f, 0xb5}},
 	{ALSSW, ym_rl, Pe, opBytes{0x0f, 0xb2}},
 	{ALSSL, ym_rl, Px, opBytes{0x0f, 0xb2}},
 	{ALSSQ, ym_rl, Pw, opBytes{0x0f, 0xb2}},

 	{ABLENDPD, yxshuf, Pq, opBytes{0x3a, 0x0d, 0}},
 	{ABLENDPS, yxshuf, Pq, opBytes{0x3a, 0x0c, 0}},
 	{AXACQUIRE, ynone, Px, opBytes{0xf2}},
 	{AXRELEASE, ynone, Px, opBytes{0xf3}},
 	{AXBEGIN, yxbegin, Px, opBytes{0xc7, 0xf8}},
 	{AXABORT, yxabort, Px, opBytes{0xc6, 0xf8}},
 	{AXEND, ynone, Px, opBytes{0x0f, 01, 0xd5}},
 	{AXTEST, ynone, Px, opBytes{0x0f, 01, 0xd6}},
 	{AXGETBV, ynone, Pm, opBytes{01, 0xd0}},
 	{obj.AFUNCDATA, yfuncdata, Px, opBytes{0, 0}},
 	{obj.APCDATA, ypcdata, Px, opBytes{0, 0}},
 	{obj.ADUFFCOPY, yduff, Px, opBytes{0xe8}},
 	{obj.ADUFFZERO, yduff, Px, opBytes{0xe8}},

 	{obj.AEND, nil, 0, opBytes{}},
 	{0, nil, 0, opBytes{}},
 }

 // opindex is a fixed-size array of *Optab pointers whose length is
 // (ALAST + 1) & obj.AMask.
 // NOTE(review): presumably it is filled from the opcode table during
 // initialization and indexed by an instruction's opcode masked with
 // obj.AMask; the code that writes and reads it is not visible here — confirm.
 var opindex [(ALAST + 1) & obj.AMask]*Optab

 // useAbs reports whether code referring to the symbol s must avoid
 // pc-relative addressing.
 //
 // On Solaris this is true only for symbols whose name starts with "libc_":
 // Solaris calls .so functions instead of making system calls, and all of its
 // dynamic imports from libc.so carry that prefix. This is probably a
 // workaround for a Solaris-specific bug that should be fixed differently,
 // but the bug is not known, and this does fix it. It does not seem to be
 // necessary for any other system.
 //
 // For every other head type the result does not depend on s: it is true
 // exactly when the architecture family is I386 and ctxt.Flag_shared is false.
 func useAbs(ctxt *obj.Link, s *obj.LSym) bool {
 	if ctxt.Headtype != objabi.Hsolaris {
 		return ctxt.Arch.Family == sys.I386 && !ctxt.Flag_shared
 	}
 	// All the Solaris dynamic imports from libc.so begin with "libc_".
 	return strings.HasPrefix(s.Name, "libc_")
 }

 // nop holds single-instruction no-ops of various lengths: entry i is an
 // encoding that is i+1 bytes long, and the rest of each 16-byte array is
 // left zero.
 // Constructed by hand and disassembled with gdb to verify.
 // See http://www.agner.org/optimize/optimizing_assembly.pdf for discussion.
 //
 // A 10-byte entry is intentionally absent because Native Client rejects
 // the repeated 0x66 prefix:
 //
 //	{0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
 var nop = [][16]uint8{
 	{0x90},
 	{0x66, 0x90},
 	{0x0F, 0x1F, 0x00},
 	{0x0F, 0x1F, 0x40, 0x00},
 	{0x0F, 0x1F, 0x44, 0x00, 0x00},
 	{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
 	{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
 	{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
 	{0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
 }

 // fillnop overwrites the first n bytes of p with no-op instructions.
 //
 // It writes the longest encoding in nop (len(nop) bytes) repeatedly while
 // at least that many bytes remain, then one shorter encoding that covers
 // the remainder exactly. p must be at least n bytes long; when n <= 0
 // nothing is written.
 func fillnop(p []byte, n int) {
 	for remaining := n; remaining > 0; {
 		// Size of the next NOP: whatever is left, capped at the longest
 		// encoding the table provides.
 		size := remaining
 		if size > len(nop) {
 			size = len(nop)
 		}
 		copy(p[:size], nop[size-1][:size])
 		p = p[size:]
 		remaining -= size
 	}
 }

 // noppad writes pad bytes of no-op padding into s.P starting at offset c
 // and returns the offset just past the padding, c+pad.
 // s.Grow is called with c+pad (computed in 64 bits) before the bytes are
 // written. ctxt is not used.
 func noppad(ctxt *obj.Link, s *obj.LSym, c int32, pad int32) int32 {
 	end := c + pad
 	s.Grow(int64(c) + int64(pad))
 	fillnop(s.P[c:], int(pad))
 	return end
 }

 // spadjop picks one of two opcodes based on the target: it returns q when
 // the architecture family is AMD64 and the pointer size is not 4, and l in
 // every other case.
 // NOTE(review): l and q are presumably the 32-bit and 64-bit forms of the
 // same stack-pointer adjusting instruction — confirm against callers.
 func spadjop(ctxt *obj.Link, l, q obj.As) obj.As {
 	if ctxt.Arch.Family == sys.AMD64 && ctxt.Arch.PtrSize != 4 {
 		return q
 	}
 	return l
 }

 // isJump reports whether p is treated as a jump instruction: either it has
 // a non-nil branch target (p.To.Target()), or its opcode is one of JMP,
 // CALL, RET, DUFFCOPY or DUFFZERO.
 // It is used to ensure that no standalone or macro-fused jump will straddle
 // or end on a 32 byte boundary by inserting NOPs before the jumps.
 func isJump(p *obj.Prog) bool {
 	if p.To.Target() != nil {
 		return true
 	}
 	switch p.As {
 	case obj.AJMP, obj.ACALL, obj.ARET, obj.ADUFFCOPY, obj.ADUFFZERO:
 		return true
 	}
 	return false
 }

 // lookForJCC returns the first real instruction after p, if that
 // instruction is a conditional jump. Otherwise, nil is returned.
 //
 // The search starts at p.Link and steps over PCDATA, FUNCDATA and NOP
 // pseudo-instructions. nil is also returned when the instruction found has
 // no branch target, or when p itself is a JMP or CALL.
 func lookForJCC(p *obj.Prog) *obj.Prog {
 	// Skip any PCDATA, FUNCDATA or NOP instructions.
 	next := p.Link
 	for next != nil {
 		if next.As != obj.APCDATA && next.As != obj.AFUNCDATA && next.As != obj.ANOP {
 			break
 		}
 		next = next.Link
 	}

 	if next == nil || next.To.Target() == nil {
 		return nil
 	}
 	if p.As == obj.AJMP || p.As == obj.ACALL {
 		return nil
 	}

 	// Only the conditional jump opcodes qualify.
 	switch next.As {
 	case AJOS, AJOC, AJCS, AJCC, AJEQ, AJNE, AJLS, AJHI,
 		AJMI, AJPL, AJPS, AJPC, AJLT, AJGE, AJLE, AJGT:
 		return next
 	}
 	return nil
 }

 // fusedJump reports whether the instruction at p can be macro-fused with the
 // conditional jump that follows it. On success it returns true together with
 // the combined size in bytes of any leading XACQUIRE/XRELEASE/LOCK prefixes,
 // the instruction itself and the jump; otherwise it returns (false, 0).
 // Macro fusion rules are derived from the Intel Optimization Manual (April 2019) section 3.4.2.2.
 func fusedJump(p *obj.Prog) (bool, uint8) {
 	var total uint8

 	// skipData returns the first Prog after q that is not PCDATA or FUNCDATA,
 	// or nil if the list ends first.
 	skipData := func(q *obj.Prog) *obj.Prog {
 		q = q.Link
 		for q != nil && (q.As == obj.APCDATA || q.As == obj.AFUNCDATA) {
 			q = q.Link
 		}
 		return q
 	}

 	// The first instruction in a macro fused pair may be preceded by the LOCK prefix,
 	// or possibly an XACQUIRE/XRELEASE prefix followed by a LOCK prefix. Their sizes
 	// are counted so that any padding ends up before the prefixes rather than
 	// directly after them.
 	if p.As == AXRELEASE || p.As == AXACQUIRE {
 		total += p.Isize
 		if p = skipData(p); p == nil {
 			return false, 0
 		}
 	}
 	if p.As == ALOCK {
 		total += p.Isize
 		if p = skipData(p); p == nil {
 			return false, 0
 		}
 	}

 	// Classify the candidate first instruction.
 	var isCmp, isTestAnd, isIncDec bool
 	switch p.As {
 	case ACMPB, ACMPL, ACMPQ, ACMPW:
 		isCmp = true
 	case AADDB, AADDL, AADDW, AADDQ,
 		ASUBB, ASUBL, ASUBW, ASUBQ:
 		// ADD/SUB: follows the same jump rules as CMP.
 	case ATESTB, ATESTL, ATESTQ, ATESTW,
 		AANDB, AANDL, AANDQ, AANDW:
 		isTestAnd = true
 	case AINCB, AINCL, AINCQ, AINCW,
 		ADECB, ADECL, ADECQ, ADECW:
 		isIncDec = true
 	default:
 		return false, 0
 	}

 	// Two-operand forms only fuse for particular operand combinations.
 	// CMP has its operands in the opposite order from the other instructions.
 	if !isIncDec {
 		first, second := p.To.Type, p.From.Type
 		if isCmp {
 			first, second = p.From.Type, p.To.Type
 		}
 		switch first {
 		case obj.TYPE_REG:
 			if second != obj.TYPE_REG && second != obj.TYPE_CONST && second != obj.TYPE_MEM {
 				return false, 0
 			}
 		case obj.TYPE_MEM:
 			if second != obj.TYPE_REG {
 				return false, 0
 			}
 		default:
 			return false, 0
 		}
 	}

 	total += p.Isize
 	jcc := lookForJCC(p)
 	if jcc == nil {
 		return false, 0
 	}
 	total += jcc.Isize

 	// TEST/AND fuse with every conditional jump.
 	if isTestAnd {
 		return true, total
 	}

 	switch jcc.As {
 	case AJOC, AJOS, AJMI, AJPL, AJPS, AJPC:
 		// Never fused with CMP/ADD/SUB/INC/DEC.
 		return false, 0
 	case AJCS, AJCC, AJHI, AJLS:
 		// Fused with CMP/ADD/SUB but not with INC/DEC.
 		if isIncDec {
 			return false, 0
 		}
 	}
 	return true, total
 }

 // padJumpsCtx is the alignment boundary, in bytes, that jumps must not cross
 // or end on. A value of 0 disables jump padding.
 type padJumpsCtx int32

 // makePjcCtx returns the jump padding context for ctxt. Padding is disabled
 // (context 0) on 32 bit builds and for hand written assembly code; in all
 // other cases jumps are padded relative to a 32 byte boundary.
 func makePjcCtx(ctxt *obj.Link) padJumpsCtx {
 	if ctxt.Arch.Family == sys.I386 || ctxt.IsAsm {
 		return padJumpsCtx(0)
 	}
 	return padJumpsCtx(32)
 }

 // padJump checks whether p, about to be assembled at offset c, is a standalone
 // or macro-fused jump that would cross or end on a pjc-byte boundary. If so,
 // NOPs are written into s up to the next boundary and the new offset is
 // returned; otherwise c is returned unchanged. A zero pjc disables padding.
 func (pjc padJumpsCtx) padJump(ctxt *obj.Link, s *obj.LSym, p *obj.Prog, c int32) int32 {
 	if pjc == 0 {
 		return c
 	}

 	boundary := int32(pjc)
 	offset := c & int32(pjc-1) // position of c within the current block

 	// Size of the jump (or fused pair) that must stay inside one block.
 	var size int32
 	if fused, fusedSize := fusedJump(p); fused {
 		size = int32(fusedSize)
 	} else if isJump(p) {
 		size = int32(p.Isize)
 	} else {
 		return c
 	}

 	if offset+size < boundary {
 		return c
 	}
 	pad := boundary - offset
 	if pad <= 0 {
 		return c
 	}
 	return noppad(ctxt, s, c, pad)
 }

 // reAssemble is consulted when an instruction's size changes during assembly.
 // It reports whether the function must be assembled again, which is the case
 // when jump padding is enabled and p is a standalone or macro-fused jump.
 func (pjc padJumpsCtx) reAssemble(p *obj.Prog) bool {
 	if pjc == 0 {
 		return false
 	}
 	if fused, _ := fusedJump(p); fused {
 		return true
 	}
 	return isJump(p)
 }

 // nopPad describes one run of padding bytes emitted between two instructions.
 type nopPad struct {
 	p *obj.Prog // Instruction before the pad
 	n int32     // Size of the pad
 }

 // span6 lays out and encodes the instructions of the function symbol s,
 // filling s.P with the machine code and setting s.Size to its length.
 //
 // It first rewrites a few instruction forms (ADJSP, branches without a
 // target and, when ctxt.Retpoline is set, CALL/JMP through a register), then
 // assembles the whole function repeatedly, starting with short branches,
 // until a pass completes without requesting re-assembly. Padding emitted by
 // the final pass is spliced into the Prog list as NOPs, and, when
 // CanUse1InsnTLS reports false, instructions using REG_TLS as an index are
 // passed to obj.MarkUnsafePoints.
 //
 // Apart from the retpoline diagnostic, nothing is done if s.P is already non-nil.
 func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 	if ctxt.Retpoline && ctxt.Arch.Family == sys.I386 {
 		ctxt.Diag("-spectre=ret not supported on 386")
 		ctxt.Retpoline = false // don't keep printing
 	}

 	pjc := makePjcCtx(ctxt)

 	// s already has code bytes; nothing to do.
 	if s.P != nil {
 		return
 	}

 	// instinit sets ycover[0]; a zero value means it has not run.
 	if ycover[0] == 0 {
 		ctxt.Diag("x86 tables not initialized, call x86.instinit first")
 	}

 	// Pre-pass: rewrite instructions before any encoding is attempted.
 	for p := s.Func().Text; p != nil; p = p.Link {
 		// A branch without a target is made to target itself.
 		if p.To.Type == obj.TYPE_BRANCH && p.To.Target() == nil {
 			p.To.SetTarget(p)
 		}
 		if p.As == AADJSP {
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = REG_SP
 			// Generate 'ADDQ $x, SP' or 'SUBQ $x, SP', with x positive.
 			// One exception: It is smaller to encode $-0x80 than $0x80.
 			// For that case, flip the sign and the op:
 			// Instead of 'ADDQ $0x80, SP', generate 'SUBQ $-0x80, SP'.
 			switch v := p.From.Offset; {
 			case v == 0:
 				p.As = obj.ANOP
 			case v == 0x80 || (v < 0 && v != -0x80):
 				p.As = spadjop(ctxt, AADDL, AADDQ)
 				p.From.Offset *= -1
 			default:
 				p.As = spadjop(ctxt, ASUBL, ASUBQ)
 			}
 		}
 		// With retpolines enabled, a CALL/JMP through a register becomes a
 		// direct branch to the runtime.retpoline<reg> symbol; a CALL/JMP
 		// through memory is reported as an error.
 		if ctxt.Retpoline && (p.As == obj.ACALL || p.As == obj.AJMP) && (p.To.Type == obj.TYPE_REG || p.To.Type == obj.TYPE_MEM) {
 			if p.To.Type != obj.TYPE_REG {
 				ctxt.Diag("non-retpoline-compatible: %v", p)
 				continue
 			}
 			p.To.Type = obj.TYPE_BRANCH
 			p.To.Name = obj.NAME_EXTERN
 			p.To.Sym = ctxt.Lookup("runtime.retpoline" + obj.Rconv(int(p.To.Reg)))
 			p.To.Reg = 0
 			p.To.Offset = 0
 		}
 	}

 	var count int64 // rough count of number of instructions
 	for p := s.Func().Text; p != nil; p = p.Link {
 		count++
 		p.Back = branchShort // use short branches first time through
 		// The target already carries branchShort if this loop has reached it
 		// (presumably Back is otherwise clear on entry — TODO confirm), so
 		// the branch goes backwards and its target is a loop head.
 		if q := p.To.Target(); q != nil && (q.Back&branchShort != 0) {
 			p.Back |= branchBackwards
 			q.Back |= branchLoopHead
 		}
 	}
 	s.GrowCap(count * 5) // preallocate roughly 5 bytes per instruction

 	var ab AsmBuf
 	var n int   // number of assembly passes performed so far
 	var c int32 // current offset within s.P
 	errors := ctxt.Errors
 	var nops []nopPad // Padding for a particular assembly (reuse slice storage if multiple assemblies)
 	nrelocs0 := len(s.R)
 	for {
 		// This loop continues while there are reasons to re-assemble
 		// whole block, like the presence of long forward jumps.
 		reAssemble := false
 		// Zero the relocations appended after the first nrelocs0 entries
 		// before truncating the slice.
 		for i := range s.R[nrelocs0:] {
 			s.R[nrelocs0+i] = obj.Reloc{}
 		}
 		s.R = s.R[:nrelocs0] // preserve marker relocations generated by the compiler
 		s.P = s.P[:0]
 		c = 0
 		var pPrev *obj.Prog
 		nops = nops[:0]
 		for p := s.Func().Text; p != nil; p = p.Link {
 			c0 := c // offset before any padding for this instruction
 			c = pjc.padJump(ctxt, s, p, c)

 			if maxLoopPad > 0 && p.Back&branchLoopHead != 0 && c&(loopAlign-1) != 0 {
 				// pad with NOPs
 				v := -c & (loopAlign - 1)

 				if v <= maxLoopPad {
 					s.Grow(int64(c) + int64(v))
 					fillnop(s.P[c:], int(v))
 					c += v
 				}
 			}

 			p.Pc = int64(c)

 			// process forward jumps to p
 			for q := p.Rel; q != nil; q = q.Forwd {
 				// v is the displacement from the end of q to p.
 				v := int32(p.Pc - (q.Pc + int64(q.Isize)))
 				if q.Back&branchShort != 0 {
 					if v > 127 {
 						// Too far for an 8-bit displacement: clear
 						// branchShort on q and request another pass.
 						reAssemble = true
 						q.Back ^= branchShort
 					}

 					// The 8-bit displacement is the third byte of JCXZL and
 					// XBEGIN, and the second byte of every other short branch.
 					if q.As == AJCXZL || q.As == AXBEGIN {
 						s.P[q.Pc+2] = byte(v)
 					} else {
 						s.P[q.Pc+1] = byte(v)
 					}
 				} else {
 					// The 32-bit displacement occupies the last four bytes of q.
 					binary.LittleEndian.PutUint32(s.P[q.Pc+int64(q.Isize)-4:], uint32(v))
 				}
 			}

 			p.Rel = nil

 			p.Pc = int64(c)
 			ab.asmins(ctxt, s, p)
 			m := ab.Len() // encoded size of p in this pass
 			if int(p.Isize) != m {
 				p.Isize = uint8(m)
 				if pjc.reAssemble(p) {
 					// We need to re-assemble here to check for jumps and fused jumps
 					// that span or end on 32 byte boundaries.
 					reAssemble = true
 				}
 			}

 			s.Grow(p.Pc + int64(m))
 			copy(s.P[p.Pc:], ab.Bytes())
 			// If there was padding, remember it.
 			if pPrev != nil && !ctxt.IsAsm && c > c0 {
 				nops = append(nops, nopPad{p: pPrev, n: c - c0})
 			}
 			c += int32(m)
 			pPrev = p
 		}

 		// Abort if the layout has not settled after 20 passes.
 		n++
 		if n > 20 {
 			ctxt.Diag("span must be looping")
 			log.Fatalf("loop")
 		}
 		if !reAssemble {
 			break
 		}
 		// Do not start another pass if this one reported new errors.
 		if ctxt.Errors > errors {
 			return
 		}
 	}
 	// splice padding nops into Progs
 	for _, n := range nops {
 		pp := n.p
 		np := &obj.Prog{Link: pp.Link, Ctxt: pp.Ctxt, As: obj.ANOP, Pos: pp.Pos.WithNotStmt(), Pc: pp.Pc + int64(pp.Isize), Isize: uint8(n.n)}
 		pp.Link = np
 	}

 	s.Size = int64(c)

 	// Disabled debugging dump of the generated bytes and relocations.
 	if false { /* debug['a'] > 1 */
 		fmt.Printf("span1 %s %d (%d tries)\n %.6x", s.Name, s.Size, n, 0)
 		var i int
 		for i = 0; i < len(s.P); i++ {
 			fmt.Printf(" %.2x", s.P[i])
 			if i%16 == 15 {
 				fmt.Printf("\n  %.6x", uint(i+1))
 			}
 		}

 		if i%16 != 0 {
 			fmt.Printf("\n")
 		}

 		for i := 0; i < len(s.R); i++ {
 			r := &s.R[i]
 			fmt.Printf(" rel %#.4x/%d %s%+d\n", uint32(r.Off), r.Siz, r.Sym.Name, r.Add)
 		}
 	}

 	// Mark nonpreemptible instruction sequences.
 	// The 2-instruction TLS access sequence
 	//	MOVQ TLS, BX
 	//	MOVQ 0(BX)(TLS*1), BX
 	// is not async preemptible, as if it is preempted and resumed on
 	// a different thread, the TLS address may become invalid.
 	if !CanUse1InsnTLS(ctxt) {
 		useTLS := func(p *obj.Prog) bool {
 			// Only need to mark the second instruction, which has
 			// REG_TLS as Index. (It is okay to interrupt and restart
 			// the first instruction.)
 			return p.From.Index == REG_TLS
 		}
 		obj.MarkUnsafePoints(ctxt, s.Func().Text, newprog, useTLS, nil)
 	}
 }

 // instinit builds the package-level tables used by the assembler: opindex
 // (opcode to optab/avxOptab entry), ycover, reg and regrex. On Plan 9 it
 // also looks up the _privates symbol.
 //
 // The function is idempotent: it sets ycover[0] to a non-zero value, and
 // returns immediately when it finds ycover[0] already set.
 func instinit(ctxt *obj.Link) {
 	if ycover[0] != 0 {
 		// Already initialized; stop now.
 		// This happens in the cmd/asm tests,
 		// each of which re-initializes the arch.
 		return
 	}

 	switch ctxt.Headtype {
 	case objabi.Hplan9:
 		plan9privates = ctxt.Lookup("_privates")
 	}

 	// Index both opcode tables by opcode; an opcode that already has an
 	// entry is reported as a "phase error".
 	for i := range avxOptab {
 		c := avxOptab[i].as
 		if opindex[c&obj.AMask] != nil {
 			ctxt.Diag("phase error in avxOptab: %d (%v)", i, c)
 		}
 		opindex[c&obj.AMask] = &avxOptab[i]
 	}
 	// optab is scanned from index 1 up to the first entry whose as field is 0.
 	for i := 1; optab[i].as != 0; i++ {
 		c := optab[i].as
 		if opindex[c&obj.AMask] != nil {
 			ctxt.Diag("phase error in optab: %d (%v)", i, c)
 		}
 		opindex[c&obj.AMask] = &optab[i]
 	}

 	// ycover is a Ymax x Ymax matrix stored row by row.
 	// NOTE(review): presumably ycover[a*Ymax+b] != 0 means an operand of
 	// class a is accepted where class b is expected — confirm against the
 	// code that reads ycover.
 	// Every class covers itself.
 	for i := 0; i < Ymax; i++ {
 		ycover[i*Ymax+i] = 1
 	}

 	// Constant classes, from narrowest to widest.
 	ycover[Yi0*Ymax+Yu2] = 1
 	ycover[Yi1*Ymax+Yu2] = 1

 	ycover[Yi0*Ymax+Yi8] = 1
 	ycover[Yi1*Ymax+Yi8] = 1
 	ycover[Yu2*Ymax+Yi8] = 1
 	ycover[Yu7*Ymax+Yi8] = 1

 	ycover[Yi0*Ymax+Yu7] = 1
 	ycover[Yi1*Ymax+Yu7] = 1
 	ycover[Yu2*Ymax+Yu7] = 1

 	ycover[Yi0*Ymax+Yu8] = 1
 	ycover[Yi1*Ymax+Yu8] = 1
 	ycover[Yu2*Ymax+Yu8] = 1
 	ycover[Yu7*Ymax+Yu8] = 1

 	ycover[Yi0*Ymax+Ys32] = 1
 	ycover[Yi1*Ymax+Ys32] = 1
 	ycover[Yu2*Ymax+Ys32] = 1
 	ycover[Yu7*Ymax+Ys32] = 1
 	ycover[Yu8*Ymax+Ys32] = 1
 	ycover[Yi8*Ymax+Ys32] = 1

 	ycover[Yi0*Ymax+Yi32] = 1
 	ycover[Yi1*Ymax+Yi32] = 1
 	ycover[Yu2*Ymax+Yi32] = 1
 	ycover[Yu7*Ymax+Yi32] = 1
 	ycover[Yu8*Ymax+Yi32] = 1
 	ycover[Yi8*Ymax+Yi32] = 1
 	ycover[Ys32*Ymax+Yi32] = 1

 	ycover[Yi0*Ymax+Yi64] = 1
 	ycover[Yi1*Ymax+Yi64] = 1
 	ycover[Yu7*Ymax+Yi64] = 1
 	ycover[Yu2*Ymax+Yi64] = 1
 	ycover[Yu8*Ymax+Yi64] = 1
 	ycover[Yi8*Ymax+Yi64] = 1
 	ycover[Ys32*Ymax+Yi64] = 1
 	ycover[Yi32*Ymax+Yi64] = 1

 	// General register classes.
 	ycover[Yal*Ymax+Yrb] = 1
 	ycover[Ycl*Ymax+Yrb] = 1
 	ycover[Yax*Ymax+Yrb] = 1
 	ycover[Ycx*Ymax+Yrb] = 1
 	ycover[Yrx*Ymax+Yrb] = 1
 	ycover[Yrl*Ymax+Yrb] = 1 // but not Yrl32

 	ycover[Ycl*Ymax+Ycx] = 1

 	ycover[Yax*Ymax+Yrx] = 1
 	ycover[Ycx*Ymax+Yrx] = 1

 	ycover[Yax*Ymax+Yrl] = 1
 	ycover[Ycx*Ymax+Yrl] = 1
 	ycover[Yrx*Ymax+Yrl] = 1
 	ycover[Yrl32*Ymax+Yrl] = 1

 	ycover[Yf0*Ymax+Yrf] = 1

 	// Register-or-memory classes.
 	ycover[Yal*Ymax+Ymb] = 1
 	ycover[Ycl*Ymax+Ymb] = 1
 	ycover[Yax*Ymax+Ymb] = 1
 	ycover[Ycx*Ymax+Ymb] = 1
 	ycover[Yrx*Ymax+Ymb] = 1
 	ycover[Yrb*Ymax+Ymb] = 1
 	ycover[Yrl*Ymax+Ymb] = 1 // but not Yrl32
 	ycover[Ym*Ymax+Ymb] = 1

 	ycover[Yax*Ymax+Yml] = 1
 	ycover[Ycx*Ymax+Yml] = 1
 	ycover[Yrx*Ymax+Yml] = 1
 	ycover[Yrl*Ymax+Yml] = 1
 	ycover[Yrl32*Ymax+Yml] = 1
 	ycover[Ym*Ymax+Yml] = 1

 	ycover[Yax*Ymax+Ymm] = 1
 	ycover[Ycx*Ymax+Ymm] = 1
 	ycover[Yrx*Ymax+Ymm] = 1
 	ycover[Yrl*Ymax+Ymm] = 1
 	ycover[Yrl32*Ymax+Ymm] = 1
 	ycover[Ym*Ymax+Ymm] = 1
 	ycover[Ymr*Ymax+Ymm] = 1

 	// Vector (X/Y/Z) register and memory classes, including EVEX variants.
 	ycover[Yxr0*Ymax+Yxr] = 1

 	ycover[Ym*Ymax+Yxm] = 1
 	ycover[Yxr0*Ymax+Yxm] = 1
 	ycover[Yxr*Ymax+Yxm] = 1

 	ycover[Ym*Ymax+Yym] = 1
 	ycover[Yyr*Ymax+Yym] = 1

 	ycover[Yxr0*Ymax+YxrEvex] = 1
 	ycover[Yxr*Ymax+YxrEvex] = 1

 	ycover[Ym*Ymax+YxmEvex] = 1
 	ycover[Yxr0*Ymax+YxmEvex] = 1
 	ycover[Yxr*Ymax+YxmEvex] = 1
 	ycover[YxrEvex*Ymax+YxmEvex] = 1

 	ycover[Yyr*Ymax+YyrEvex] = 1

 	ycover[Ym*Ymax+YymEvex] = 1
 	ycover[Yyr*Ymax+YymEvex] = 1
 	ycover[YyrEvex*Ymax+YymEvex] = 1

 	ycover[Ym*Ymax+Yzm] = 1
 	ycover[Yzr*Ymax+Yzm] = 1

 	// K register classes.
 	ycover[Yk0*Ymax+Yk] = 1
 	ycover[Yknot0*Ymax+Yk] = 1

 	ycover[Yk0*Ymax+Ykm] = 1
 	ycover[Yknot0*Ymax+Ykm] = 1
 	ycover[Yk*Ymax+Ykm] = 1
 	ycover[Ym*Ymax+Ykm] = 1

 	// Vector-indexed memory classes.
 	ycover[Yxvm*Ymax+YxvmEvex] = 1

 	ycover[Yyvm*Ymax+YyvmEvex] = 1

 	// Fill reg and regrex for every register. reg[i] receives the low three
 	// bits of the register's number within its group, or -1 for registers
 	// not handled below.
 	// NOTE(review): regrex[i] presumably holds the REX/EVEX extension bits
 	// needed to encode register i — confirm against the users of regrex.
 	for i := 0; i < MAXREG; i++ {
 		reg[i] = -1
 		if i >= REG_AL && i <= REG_R15B {
 			reg[i] = (i - REG_AL) & 7
 			if i >= REG_SPB && i <= REG_DIB {
 				regrex[i] = 0x40
 			}
 			if i >= REG_R8B && i <= REG_R15B {
 				regrex[i] = Rxr | Rxx | Rxb
 			}
 		}

 		// AH..BH are numbered 4 and up.
 		if i >= REG_AH && i <= REG_BH {
 			reg[i] = 4 + ((i - REG_AH) & 7)
 		}
 		if i >= REG_AX && i <= REG_R15 {
 			reg[i] = (i - REG_AX) & 7
 			if i >= REG_R8 {
 				regrex[i] = Rxr | Rxx | Rxb
 			}
 		}

 		if i >= REG_F0 && i <= REG_F0+7 {
 			reg[i] = (i - REG_F0) & 7
 		}
 		if i >= REG_M0 && i <= REG_M0+7 {
 			reg[i] = (i - REG_M0) & 7
 		}
 		if i >= REG_K0 && i <= REG_K0+7 {
 			reg[i] = (i - REG_K0) & 7
 		}
 		// For X, Y and Z registers: 8-15 get Rxr|Rxx|Rxb, 16-23 get RxrEvex,
 		// and 24-31 get both.
 		if i >= REG_X0 && i <= REG_X0+15 {
 			reg[i] = (i - REG_X0) & 7
 			if i >= REG_X0+8 {
 				regrex[i] = Rxr | Rxx | Rxb
 			}
 		}
 		if i >= REG_X16 && i <= REG_X16+15 {
 			reg[i] = (i - REG_X16) & 7
 			if i >= REG_X16+8 {
 				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
 			} else {
 				regrex[i] = RxrEvex
 			}
 		}
 		if i >= REG_Y0 && i <= REG_Y0+15 {
 			reg[i] = (i - REG_Y0) & 7
 			if i >= REG_Y0+8 {
 				regrex[i] = Rxr | Rxx | Rxb
 			}
 		}
 		if i >= REG_Y16 && i <= REG_Y16+15 {
 			reg[i] = (i - REG_Y16) & 7
 			if i >= REG_Y16+8 {
 				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
 			} else {
 				regrex[i] = RxrEvex
 			}
 		}
 		if i >= REG_Z0 && i <= REG_Z0+15 {
 			reg[i] = (i - REG_Z0) & 7
 			if i > REG_Z0+7 {
 				regrex[i] = Rxr | Rxx | Rxb
 			}
 		}
 		if i >= REG_Z16 && i <= REG_Z16+15 {
 			reg[i] = (i - REG_Z16) & 7
 			if i >= REG_Z16+8 {
 				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
 			} else {
 				regrex[i] = RxrEvex
 			}
 		}

 		// Only regrex is set for CR8-CR15; reg stays -1 for them here.
 		if i >= REG_CR+8 && i <= REG_CR+15 {
 			regrex[i] = Rxr
 		}
 	}
 }

 // isAndroid records whether buildcfg.GOOS is "android".
 var isAndroid = buildcfg.GOOS == "android"

 // prefixof returns the segment prefix byte required by operand a, or 0 if
 // the operand needs none.
 //
 // A segment register (CS, DS, ES, FS, GS) used as the base of an unnamed
 // memory operand, or as its index, selects the corresponding prefix byte.
 // REG_TLS is mapped to the FS (0x64) or GS (0x65) prefix depending on the
 // architecture, ctxt.Headtype and ctxt.Flag_shared; for a TLS base on a
 // target that is not listed the process is terminated with log.Fatalf.
 func prefixof(ctxt *obj.Link, a *obj.Addr) int {
 	if a.Reg < REG_CS && a.Index < REG_CS { // fast path
 		return 0
 	}
 	// Segment register (or TLS) used as the base of an unnamed memory operand.
 	if a.Type == obj.TYPE_MEM && a.Name == obj.NAME_NONE {
 		switch a.Reg {
 		case REG_CS:
 			return 0x2e

 		case REG_DS:
 			return 0x3e

 		case REG_ES:
 			return 0x26

 		case REG_FS:
 			return 0x64

 		case REG_GS:
 			return 0x65

 		case REG_TLS:
 			// NOTE: Systems listed here should be only systems that
 			// support direct TLS references like 8(TLS) implemented as
 			// direct references from FS or GS. Systems that require
 			// the initial-exec model, where you load the TLS base into
 			// a register and then index from that register, do not reach
 			// this code and should not be listed.
 			if ctxt.Arch.Family == sys.I386 {
 				switch ctxt.Headtype {
 				default:
 					if isAndroid {
 						return 0x65 // GS
 					}
 					log.Fatalf("unknown TLS base register for %v", ctxt.Headtype)

 				case objabi.Hdarwin,
 					objabi.Hdragonfly,
 					objabi.Hfreebsd,
 					objabi.Hnetbsd,
 					objabi.Hopenbsd:
 					return 0x65 // GS
 				}
 			}

 			// Not 386 (the 386 switch above returns or exits on every path).
 			switch ctxt.Headtype {
 			default:
 				log.Fatalf("unknown TLS base register for %v", ctxt.Headtype)

 			case objabi.Hlinux:
 				if isAndroid {
 					return 0x64 // FS
 				}

 				if ctxt.Flag_shared {
 					log.Fatalf("unknown TLS base register for linux with -shared")
 				} else {
 					return 0x64 // FS
 				}

 			case objabi.Hdragonfly,
 				objabi.Hfreebsd,
 				objabi.Hnetbsd,
 				objabi.Hopenbsd,
 				objabi.Hsolaris:
 				return 0x64 // FS

 			case objabi.Hdarwin:
 				return 0x65 // GS
 			}
 		}
 	}

 	// On 386 the only index register that yields a prefix is TLS in shared mode.
 	if ctxt.Arch.Family == sys.I386 {
 		if a.Index == REG_TLS && ctxt.Flag_shared {
 			// When building for inclusion into a shared library, an instruction of the form
 			//     MOVL off(CX)(TLS*1), AX
 			// becomes
 			//     mov %gs:off(%ecx), %eax
 			// which assumes that the correct TLS offset has been loaded into %ecx (today
 			// there is only one TLS variable -- g -- so this is OK). When not building for
 			// a shared library the instruction it becomes
 			//     mov 0x0(%ecx), %eax
 			// and a R_TLS_LE relocation, and so does not require a prefix.
 			return 0x65 // GS
 		}
 		return 0
 	}

 	// Segment register (or TLS) used as the index.
 	switch a.Index {
 	case REG_CS:
 		return 0x2e

 	case REG_DS:
 		return 0x3e

 	case REG_ES:
 		return 0x26

 	case REG_TLS:
 		if ctxt.Flag_shared && ctxt.Headtype != objabi.Hwindows {
 			// When building for inclusion into a shared library, an instruction of the form
 			//     MOV off(CX)(TLS*1), AX
 			// becomes
 			//     mov %fs:off(%rcx), %rax
 			// which assumes that the correct TLS offset has been loaded into %rcx (today
 			// there is only one TLS variable -- g -- so this is OK). When not building for
 			// a shared library the instruction does not require a prefix.
 			return 0x64
 		}

 	case REG_FS:
 		return 0x64

 	case REG_GS:
 		return 0x65
 	}

 	return 0
 }

 // oclassRegList returns the multisource operand class for the register
 // range encoded in addr.Offset. Only ranges whose register indexes are
 // exactly 3 apart (four registers) and whose endpoints are both X, both Y
 // or both Z registers are recognized; anything else yields Yxxx. On 386,
 // ranges touching a register with index 8 or above are rejected as well.
 func oclassRegList(ctxt *obj.Link, addr *obj.Addr) int {
 	// TODO(quasilyte): when oclass register case is refactored into
 	// lookup table, use it here to get register kind more easily.
 	// The bothIn helper should go away too (it will become redundant).

 	first, last := decodeRegisterRange(addr.Offset)
 	lo, hi := regIndex(int16(first)), regIndex(int16(last))

 	if ctxt.Arch.Family == sys.I386 && (lo >= 8 || hi >= 8) {
 		return Yxxx
 	}
 	if hi-lo != 3 {
 		return Yxxx
 	}

 	// bothIn reports whether both ends of the range lie in [from, to].
 	bothIn := func(from, to int) bool {
 		return first >= from && first <= to && last >= from && last <= to
 	}

 	switch {
 	case bothIn(REG_X0, REG_X31):
 		return YxrEvexMulti4
 	case bothIn(REG_Y0, REG_Y31):
 		return YyrEvexMulti4
 	case bothIn(REG_Z0, REG_Z31):
 		return YzrMulti4
 	}
 	return Yxxx
 }

 // oclassVMem returns V-mem (vector memory with VSIB) operand class.
 // The boolean result reports whether addr.Index is an X, Y or Z register;
 // for addr that is not V-mem it returns (Yxxx, false).
 // On 386 only vector registers 0-7 are accepted as index: a higher
 // numbered one yields (Yxxx, true).
 func oclassVMem(ctxt *obj.Link, addr *obj.Addr) (int, bool) {
 	var class int
 	switch idx := addr.Index; {
 	// Registers 0-7 are valid on every architecture.
 	case idx >= REG_X0 && idx <= REG_X0+7:
 		return Yxvm, true
 	case idx >= REG_Y0 && idx <= REG_Y0+7:
 		return Yyvm, true
 	case idx >= REG_Z0 && idx <= REG_Z0+7:
 		return Yzvm, true

 	// Registers 8 and above are subject to the 386 check below.
 	case idx >= REG_X8 && idx <= REG_X8+7:
 		class = Yxvm
 	case idx >= REG_X16 && idx <= REG_X16+15:
 		class = YxvmEvex
 	case idx >= REG_Y8 && idx <= REG_Y8+7:
 		class = Yyvm
 	case idx >= REG_Y16 && idx <= REG_Y16+15:
 		class = YyvmEvex
 	case idx >= REG_Z8 && idx <= REG_Z8+23:
 		class = Yzvm

 	default:
 		return Yxxx, false
 	}

 	if ctxt.Arch.Family == sys.I386 {
 		return Yxxx, true
 	}
 	return class, true
 }

 // oclass returns the operand class (one of the Y* constants) of operand a.
 // Yxxx is returned for operands that are not recognized or not valid on
 // the current architecture. p is used only when formatting diagnostics.
 func oclass(ctxt *obj.Link, p *obj.Prog, a *obj.Addr) int {
 	switch a.Type {
 	case obj.TYPE_REGLIST:
 		return oclassRegList(ctxt, a)

 	case obj.TYPE_NONE:
 		return Ynone

 	case obj.TYPE_BRANCH:
 		return Ybr

 	case obj.TYPE_INDIR:
 		// Only a bare named symbol (no base, index or scale) is accepted.
 		if a.Name != obj.NAME_NONE && a.Reg == REG_NONE && a.Index == REG_NONE && a.Scale == 0 {
 			return Yindir
 		}
 		return Yxxx

 	case obj.TYPE_MEM:
 		// Pseudo registers have negative index, but SP is
 		// not pseudo on x86, hence REG_SP check is not redundant.
 		if a.Index == REG_SP || a.Index < 0 {
 			// Can't use FP/SB/PC/SP as the index register.
 			return Yxxx
 		}

 		// A vector register as index makes this a V-mem operand.
 		if vmem, ok := oclassVMem(ctxt, a); ok {
 			return vmem
 		}

 		if ctxt.Arch.Family == sys.AMD64 {
 			switch a.Name {
 			case obj.NAME_EXTERN, obj.NAME_STATIC, obj.NAME_GOTREF:
 				// Global variables can't use index registers and their
 				// base register is %rip (%rip is encoded as REG_NONE).
 				if a.Reg != REG_NONE || a.Index != REG_NONE || a.Scale != 0 {
 					return Yxxx
 				}
 			case obj.NAME_AUTO, obj.NAME_PARAM:
 				// These names must have a base of SP.  The old compiler
 				// uses 0 for the base register. SSA uses REG_SP.
 				if a.Reg != REG_SP && a.Reg != 0 {
 					return Yxxx
 				}
 			case obj.NAME_NONE:
 				// everything is ok
 			default:
 				// unknown name
 				return Yxxx
 			}
 		}
 		return Ym

 	case obj.TYPE_ADDR:
 		switch a.Name {
 		case obj.NAME_GOTREF:
 			ctxt.Diag("unexpected TYPE_ADDR with NAME_GOTREF")
 			return Yxxx

 		case obj.NAME_EXTERN,
 			obj.NAME_STATIC:
 			if a.Sym != nil && useAbs(ctxt, a.Sym) {
 				return Yi32
 			}
 			return Yiauto // use pc-relative addressing

 		case obj.NAME_AUTO,
 			obj.NAME_PARAM:
 			return Yiauto
 		}

 		// TODO(rsc): DUFFZERO/DUFFCOPY encoding forgot to set a->index
 		// and got Yi32 in an earlier version of this code.
 		// Keep doing that until we fix yduff etc.
 		if a.Sym != nil && strings.HasPrefix(a.Sym.Name, "runtime.duff") {
 			return Yi32
 		}

 		if a.Sym != nil || a.Name != obj.NAME_NONE {
 			ctxt.Diag("unexpected addr: %v", obj.Dconv(p, a))
 		}
 		// Otherwise the address is classified like a constant.
 		fallthrough

 	case obj.TYPE_CONST:
 		if a.Sym != nil {
 			ctxt.Diag("TYPE_CONST with symbol: %v", obj.Dconv(p, a))
 		}

 		v := a.Offset
 		// On 386 only the low 32 bits, sign-extended, are classified.
 		if ctxt.Arch.Family == sys.I386 {
 			v = int64(int32(v))
 		}
 		// Return the first, i.e. narrowest, class that holds v.
 		switch {
 		case v == 0:
 			return Yi0
 		case v == 1:
 			return Yi1
 		case v >= 0 && v <= 3:
 			return Yu2
 		case v >= 0 && v <= 127:
 			return Yu7
 		case v >= 0 && v <= 255:
 			return Yu8
 		case v >= -128 && v <= 127:
 			return Yi8
 		}
 		if ctxt.Arch.Family == sys.I386 {
 			return Yi32
 		}
 		l := int32(v)
 		if int64(l) == v {
 			return Ys32 // can sign extend
 		}
 		if v>>32 == 0 {
 			return Yi32 // unsigned
 		}
 		return Yi64

 	case obj.TYPE_TEXTSIZE:
 		return Ytextsize
 	}

 	// Every remaining valid operand is a register.
 	if a.Type != obj.TYPE_REG {
 		ctxt.Diag("unexpected addr1: type=%d %v", a.Type, obj.Dconv(p, a))
 		return Yxxx
 	}

 	switch a.Reg {
 	case REG_AL:
 		return Yal

 	case REG_AX:
 		return Yax

 		/*
 			case REG_SPB:
 		*/
 	// These byte registers are rejected on 386.
 	case REG_BPB,
 		REG_SIB,
 		REG_DIB,
 		REG_R8B,
 		REG_R9B,
 		REG_R10B,
 		REG_R11B,
 		REG_R12B,
 		REG_R13B,
 		REG_R14B,
 		REG_R15B:
 		if ctxt.Arch.Family == sys.I386 {
 			return Yxxx
 		}
 		fallthrough

 	case REG_DL,
 		REG_BL,
 		REG_AH,
 		REG_CH,
 		REG_DH,
 		REG_BH:
 		return Yrb

 	case REG_CL:
 		return Ycl

 	case REG_CX:
 		return Ycx

 	case REG_DX, REG_BX:
 		return Yrx

 	case REG_R8, // not really Yrl
 		REG_R9,
 		REG_R10,
 		REG_R11,
 		REG_R12,
 		REG_R13,
 		REG_R14,
 		REG_R15:
 		if ctxt.Arch.Family == sys.I386 {
 			return Yxxx
 		}
 		fallthrough

 	case REG_SP, REG_BP, REG_SI, REG_DI:
 		if ctxt.Arch.Family == sys.I386 {
 			return Yrl32
 		}
 		return Yrl

 	case REG_F0 + 0:
 		return Yf0

 	case REG_F0 + 1,
 		REG_F0 + 2,
 		REG_F0 + 3,
 		REG_F0 + 4,
 		REG_F0 + 5,
 		REG_F0 + 6,
 		REG_F0 + 7:
 		return Yrf

 	case REG_M0 + 0,
 		REG_M0 + 1,
 		REG_M0 + 2,
 		REG_M0 + 3,
 		REG_M0 + 4,
 		REG_M0 + 5,
 		REG_M0 + 6,
 		REG_M0 + 7:
 		return Ymr

 	case REG_X0:
 		return Yxr0

 	case REG_X0 + 1,
 		REG_X0 + 2,
 		REG_X0 + 3,
 		REG_X0 + 4,
 		REG_X0 + 5,
 		REG_X0 + 6,
 		REG_X0 + 7,
 		REG_X0 + 8,
 		REG_X0 + 9,
 		REG_X0 + 10,
 		REG_X0 + 11,
 		REG_X0 + 12,
 		REG_X0 + 13,
 		REG_X0 + 14,
 		REG_X0 + 15:
 		return Yxr

 	case REG_X0 + 16,
 		REG_X0 + 17,
 		REG_X0 + 18,
 		REG_X0 + 19,
 		REG_X0 + 20,
 		REG_X0 + 21,
 		REG_X0 + 22,
 		REG_X0 + 23,
 		REG_X0 + 24,
 		REG_X0 + 25,
 		REG_X0 + 26,
 		REG_X0 + 27,
 		REG_X0 + 28,
 		REG_X0 + 29,
 		REG_X0 + 30,
 		REG_X0 + 31:
 		return YxrEvex

 	case REG_Y0 + 0,
 		REG_Y0 + 1,
 		REG_Y0 + 2,
 		REG_Y0 + 3,
 		REG_Y0 + 4,
 		REG_Y0 + 5,
 		REG_Y0 + 6,
 		REG_Y0 + 7,
 		REG_Y0 + 8,
 		REG_Y0 + 9,
 		REG_Y0 + 10,
 		REG_Y0 + 11,
 		REG_Y0 + 12,
 		REG_Y0 + 13,
 		REG_Y0 + 14,
 		REG_Y0 + 15:
 		return Yyr

 	case REG_Y0 + 16,
 		REG_Y0 + 17,
 		REG_Y0 + 18,
 		REG_Y0 + 19,
 		REG_Y0 + 20,
 		REG_Y0 + 21,
 		REG_Y0 + 22,
 		REG_Y0 + 23,
 		REG_Y0 + 24,
 		REG_Y0 + 25,
 		REG_Y0 + 26,
 		REG_Y0 + 27,
 		REG_Y0 + 28,
 		REG_Y0 + 29,
 		REG_Y0 + 30,
 		REG_Y0 + 31:
 		return YyrEvex

 	case REG_Z0 + 0,
 		REG_Z0 + 1,
 		REG_Z0 + 2,
 		REG_Z0 + 3,
 		REG_Z0 + 4,
 		REG_Z0 + 5,
 		REG_Z0 + 6,
 		REG_Z0 + 7:
 		return Yzr

 	// Z8-Z31 are rejected on 386.
 	case REG_Z0 + 8,
 		REG_Z0 + 9,
 		REG_Z0 + 10,
 		REG_Z0 + 11,
 		REG_Z0 + 12,
 		REG_Z0 + 13,
 		REG_Z0 + 14,
 		REG_Z0 + 15,
 		REG_Z0 + 16,
 		REG_Z0 + 17,
 		REG_Z0 + 18,
 		REG_Z0 + 19,
 		REG_Z0 + 20,
 		REG_Z0 + 21,
 		REG_Z0 + 22,
 		REG_Z0 + 23,
 		REG_Z0 + 24,
 		REG_Z0 + 25,
 		REG_Z0 + 26,
 		REG_Z0 + 27,
 		REG_Z0 + 28,
 		REG_Z0 + 29,
 		REG_Z0 + 30,
 		REG_Z0 + 31:
 		if ctxt.Arch.Family == sys.I386 {
 			return Yxxx
 		}
 		return Yzr

 	case REG_K0:
 		return Yk0

 	case REG_K0 + 1,
 		REG_K0 + 2,
 		REG_K0 + 3,
 		REG_K0 + 4,
 		REG_K0 + 5,
 		REG_K0 + 6,
 		REG_K0 + 7:
 		return Yknot0

 	case REG_CS:
 		return Ycs
 	case REG_SS:
 		return Yss
 	case REG_DS:
 		return Yds
 	case REG_ES:
 		return Yes
 	case REG_FS:
 		return Yfs
 	case REG_GS:
 		return Ygs
 	case REG_TLS:
 		return Ytls

 	case REG_GDTR:
 		return Ygdtr
 	case REG_IDTR:
 		return Yidtr
 	case REG_LDTR:
 		return Yldtr
 	case REG_MSW:
 		return Ymsw
 	case REG_TASK:
 		return Ytask

 	case REG_CR + 0:
 		return Ycr0
 	case REG_CR + 1:
 		return Ycr1
 	case REG_CR + 2:
 		return Ycr2
 	case REG_CR + 3:
 		return Ycr3
 	case REG_CR + 4:
 		return Ycr4
 	case REG_CR + 5:
 		return Ycr5
 	case REG_CR + 6:
 		return Ycr6
 	case REG_CR + 7:
 		return Ycr7
 	case REG_CR + 8:
 		return Ycr8

 	case REG_DR + 0:
 		return Ydr0
 	case REG_DR + 1:
 		return Ydr1
 	case REG_DR + 2:
 		return Ydr2
 	case REG_DR + 3:
 		return Ydr3
 	case REG_DR + 4:
 		return Ydr4
 	case REG_DR + 5:
 		return Ydr5
 	case REG_DR + 6:
 		return Ydr6
 	case REG_DR + 7:
 		return Ydr7

 	case REG_TR + 0:
 		return Ytr0
 	case REG_TR + 1:
 		return Ytr1
 	case REG_TR + 2:
 		return Ytr2
 	case REG_TR + 3:
 		return Ytr3
 	case REG_TR + 4:
 		return Ytr4
 	case REG_TR + 5:
 		return Ytr5
 	case REG_TR + 6:
 		return Ytr6
 	case REG_TR + 7:
 		return Ytr7
 	}

 	// Unknown register.
 	return Yxxx
 }

 // AsmBuf is a simple buffer to assemble variable-length x86 instructions into
 // and hold assembly state.
 // The encoded bytes are buf[:off]; buf has a fixed capacity of 100 bytes and
 // the Put methods do not grow it.
 type AsmBuf struct {
 	buf      [100]byte // storage for the instruction being encoded
 	off      int       // number of bytes of buf currently in use
 	rexflag  int       // NOTE(review): presumably the accumulated REX prefix bits — confirm against asmins
 	vexflag  bool      // Per inst: true for VEX-encoded
 	evexflag bool      // Per inst: true for EVEX-encoded
 	rep      bool      // NOTE(review): presumably set when a REP prefix is pending — confirm
 	repn     bool      // NOTE(review): presumably set when a REPN prefix is pending — confirm
 	lock     bool      // NOTE(review): presumably set when a LOCK prefix is pending — confirm

 	evex evexBits // Initialized when evexflag is true
 }

 // Put1 appends the single byte x to the buffer.
 func (ab *AsmBuf) Put1(x byte) {
 	pos := ab.off
 	ab.buf[pos] = x
 	ab.off = pos + 1
 }

 // Put2 appends the bytes x and y, in that order, to the buffer.
 func (ab *AsmBuf) Put2(x, y byte) {
 	pos := ab.off
 	ab.buf[pos] = x
 	ab.buf[pos+1] = y
 	ab.off = pos + 2
 }

 // Put3 appends the bytes x, y and z, in that order, to the buffer.
 func (ab *AsmBuf) Put3(x, y, z byte) {
 	for i, b := range [...]byte{x, y, z} {
 		ab.buf[ab.off+i] = b
 	}
 	ab.off += 3
 }

 // Put4 appends the bytes x, y, z and w, in that order, to the buffer.
 func (ab *AsmBuf) Put4(x, y, z, w byte) {
 	for i, b := range [...]byte{x, y, z, w} {
 		ab.buf[ab.off+i] = b
 	}
 	ab.off += 4
 }

 // PutInt16 appends the two bytes of v to the buffer, least significant
 // byte first (little-endian).
 func (ab *AsmBuf) PutInt16(v int16) {
 	low, high := byte(v), byte(v>>8)
 	pos := ab.off
 	ab.buf[pos] = low
 	ab.buf[pos+1] = high
 	ab.off = pos + 2
 }

 // PutInt32 appends the four bytes of v to the buffer, least significant
 // byte first (little-endian).
 func (ab *AsmBuf) PutInt32(v int32) {
 	for i := 0; i < 4; i++ {
 		ab.buf[ab.off+i] = byte(v >> (8 * i))
 	}
 	ab.off += 4
 }

 // PutInt64 appends the eight bytes of v to the buffer, least significant
 // byte first (little-endian).
 func (ab *AsmBuf) PutInt64(v int64) {
 	for i := 0; i < 8; i++ {
 		ab.buf[ab.off+i] = byte(v >> (8 * i))
 	}
 	ab.off += 8
 }

 // Put copies b to the end of the buffer and advances the length by len(b).
 func (ab *AsmBuf) Put(b []byte) {
 	tail := ab.buf[ab.off:]
 	copy(tail, b)
 	ab.off += len(b)
 }

 // PutOpBytesLit appends the bytes of op beginning at offset (e.g. the z
 // counter value) and stops at the first zero byte, which acts as a
 // terminator and is not written.
 //
 // Intended to be used for literal Z cases.
 // Literal Z cases usually have "Zlit" in their name (Zlit, Zlitr_m, Zlitm_r).
 func (ab *AsmBuf) PutOpBytesLit(offset int, op *opBytes) {
 	for i := offset; ; i++ {
 		if int(op[i]) == 0 {
 			return
 		}
 		ab.Put1(byte(op[i]))
 	}
 }

 // Insert places b at offset i, shifting the bytes that were at i and beyond
 // up by one position. The buffer grows by one byte.
 func (ab *AsmBuf) Insert(i int, b byte) {
 	ab.off++
 	end := ab.off
 	copy(ab.buf[i+1:end], ab.buf[i:end-1])
 	ab.buf[i] = b
 }

 // Last returns the most recently appended byte, i.e. the one at index Len()-1.
 func (ab *AsmBuf) Last() byte {
 	return ab.buf[ab.off-1]
 }

 // Len returns the number of bytes currently held in the buffer.
 func (ab *AsmBuf) Len() int {
 	return ab.off
 }

 // Bytes returns the bytes written so far. The result is a slice of the
 // buffer's own storage, not a copy.
 func (ab *AsmBuf) Bytes() []byte {
 	return ab.buf[0:ab.off]
 }

 // Reset empties the buffer by setting its length back to zero.
 // The underlying storage is not cleared.
 func (ab *AsmBuf) Reset() {
 	ab.off = 0
 }

 // At returns the byte stored at offset i of the buffer.
 func (ab *AsmBuf) At(i int) byte {
 	return ab.buf[i]
 }

 // asmidx emits SIB byte.
 //
 // The byte is assembled as scale<<6 | index<<3 | base, where:
 //   - scale must be 1, 2, 4 or 8 and is encoded as 0, 1, 2 or 3;
 //   - index is a general purpose or X/Y/Z register, encoded as reg[index],
 //     or REG_NONE, encoded as 4 (scale is then not examined);
 //   - base is a general purpose register, encoded as reg[base], or REG_NONE,
 //     encoded as 5.
 //
 // Registers numbered 8 and above are rejected on 386. For any operand that
 // cannot be encoded a diagnostic is reported and a zero byte is emitted, so
 // that exactly one byte is always appended.
 func (ab *AsmBuf) asmidx(ctxt *obj.Link, scale int, index int, base int) {
 	var i int

 	// X/Y index register is used in VSIB.
 	switch index {
 	default:
 		goto bad

 	case REG_NONE:
 		// No index: skip the scale switch and go straight to the base.
 		i = 4 << 3
 		goto bas

 	case REG_R8,
 		REG_R9,
 		REG_R10,
 		REG_R11,
 		REG_R12,
 		REG_R13,
 		REG_R14,
 		REG_R15,
 		REG_X8,
 		REG_X9,
 		REG_X10,
 		REG_X11,
 		REG_X12,
 		REG_X13,
 		REG_X14,
 		REG_X15,
 		REG_X16,
 		REG_X17,
 		REG_X18,
 		REG_X19,
 		REG_X20,
 		REG_X21,
 		REG_X22,
 		REG_X23,
 		REG_X24,
 		REG_X25,
 		REG_X26,
 		REG_X27,
 		REG_X28,
 		REG_X29,
 		REG_X30,
 		REG_X31,
 		REG_Y8,
 		REG_Y9,
 		REG_Y10,
 		REG_Y11,
 		REG_Y12,
 		REG_Y13,
 		REG_Y14,
 		REG_Y15,
 		REG_Y16,
 		REG_Y17,
 		REG_Y18,
 		REG_Y19,
 		REG_Y20,
 		REG_Y21,
 		REG_Y22,
 		REG_Y23,
 		REG_Y24,
 		REG_Y25,
 		REG_Y26,
 		REG_Y27,
 		REG_Y28,
 		REG_Y29,
 		REG_Y30,
 		REG_Y31,
 		REG_Z8,
 		REG_Z9,
 		REG_Z10,
 		REG_Z11,
 		REG_Z12,
 		REG_Z13,
 		REG_Z14,
 		REG_Z15,
 		REG_Z16,
 		REG_Z17,
 		REG_Z18,
 		REG_Z19,
 		REG_Z20,
 		REG_Z21,
 		REG_Z22,
 		REG_Z23,
 		REG_Z24,
 		REG_Z25,
 		REG_Z26,
 		REG_Z27,
 		REG_Z28,
 		REG_Z29,
 		REG_Z30,
 		REG_Z31:
 		// Registers 8 and above are rejected on 386.
 		if ctxt.Arch.Family == sys.I386 {
 			goto bad
 		}
 		fallthrough

 	// Note that REG_SP is not accepted as an index register.
 	case REG_AX,
 		REG_CX,
 		REG_DX,
 		REG_BX,
 		REG_BP,
 		REG_SI,
 		REG_DI,
 		REG_X0,
 		REG_X1,
 		REG_X2,
 		REG_X3,
 		REG_X4,
 		REG_X5,
 		REG_X6,
 		REG_X7,
 		REG_Y0,
 		REG_Y1,
 		REG_Y2,
 		REG_Y3,
 		REG_Y4,
 		REG_Y5,
 		REG_Y6,
 		REG_Y7,
 		REG_Z0,
 		REG_Z1,
 		REG_Z2,
 		REG_Z3,
 		REG_Z4,
 		REG_Z5,
 		REG_Z6,
 		REG_Z7:
 		i = reg[index] << 3
 	}

 	// The scale factor goes into the top two bits.
 	switch scale {
 	default:
 		goto bad

 	case 1:
 		break

 	case 2:
 		i |= 1 << 6

 	case 4:
 		i |= 2 << 6

 	case 8:
 		i |= 3 << 6
 	}

 bas:
 	// The base register goes into the low three bits.
 	switch base {
 	default:
 		goto bad

 	case REG_NONE: // must be mod=00
 		i |= 5

 	case REG_R8,
 		REG_R9,
 		REG_R10,
 		REG_R11,
 		REG_R12,
 		REG_R13,
 		REG_R14,
 		REG_R15:
 		if ctxt.Arch.Family == sys.I386 {
 			goto bad
 		}
 		fallthrough

 	case REG_AX,
 		REG_CX,
 		REG_DX,
 		REG_BX,
 		REG_SP,
 		REG_BP,
 		REG_SI,
 		REG_DI:
 		i |= reg[base]
 	}

 	ab.Put1(byte(i))
 	return

 bad:
 	// Report the problem and still emit one byte as a placeholder.
 	ctxt.Diag("asmidx: bad address %d/%d/%d", scale, index, base)
 	ab.Put1(0)
 }

 // relput4 emits the 32-bit value of address a. If vaddr reports that a
 // relocation is required, the relocation is first attached to cursym at
 // the position where the value is about to be written. A relocation
 // whose size is not 4 is diagnosed as "bad reloc" but still recorded.
 func (ab *AsmBuf) relput4(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr) {
 	var reloc obj.Reloc
 	value := vaddr(ctxt, p, a, &reloc)

 	if needsReloc := reloc.Siz != 0; needsReloc {
 		if reloc.Siz != 4 {
 			ctxt.Diag("bad reloc")
 		}
 		slot := obj.Addrel(cursym)
 		*slot = reloc
 		slot.Off = int32(p.Pc + int64(ab.Len()))
 	}

 	ab.PutInt32(int32(value))
 }

 // vaddr returns the constant value to encode for address a and, when a
 // relocation is needed, fills in *r to describe it.
 //
 // If r is non-nil it is always reset to the zero Reloc first. For
 // symbol references (NAME_STATIC, NAME_GOTREF, NAME_EXTERN) and for
 // TLS-based memory/address operands the returned value is 0; any offset
 // is carried in r.Add instead. In those cases a nil r is a fatal error.
 // r.Off is set to -1 and must be filled in by the caller.
 // For every other operand the result is simply a.Offset.
 func vaddr(ctxt *obj.Link, p *obj.Prog, a *obj.Addr, r *obj.Reloc) int64 {
 	if r != nil {
 		*r = obj.Reloc{}
 	}

 	switch a.Name {
 	case obj.NAME_STATIC,
 		obj.NAME_GOTREF,
 		obj.NAME_EXTERN:
 		if r == nil {
 			ctxt.Diag("need reloc for %v", obj.Dconv(p, a))
 			log.Fatalf("reloc")
 		}

 		// All three flavors use a 4-byte relocation; only the type differs.
 		switch {
 		case a.Name == obj.NAME_GOTREF:
 			r.Type = objabi.R_GOTPCREL
 		case useAbs(ctxt, a.Sym):
 			r.Type = objabi.R_ADDR
 		default:
 			r.Type = objabi.R_PCREL
 		}
 		r.Siz = 4
 		r.Sym = a.Sym
 		r.Add = a.Offset
 		r.Off = -1 // caller must fill in

 		return 0
 	}

 	tlsBased := (a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR) && a.Reg == REG_TLS
 	if !tlsBased {
 		return a.Offset
 	}

 	if r == nil {
 		ctxt.Diag("need reloc for %v", obj.Dconv(p, a))
 		log.Fatalf("reloc")
 	}

 	// A local-exec TLS relocation is emitted unless building shared
 	// code for a target other than Android or Darwin.
 	if !ctxt.Flag_shared || isAndroid || ctxt.Headtype == objabi.Hdarwin {
 		r.Type = objabi.R_TLS_LE
 		r.Siz = 4
 		r.Add = a.Offset
 		r.Off = -1 // caller must fill in
 	}
 	return 0
 }

 // asmandsz emits the addressing bytes for operand a: a ModR/M-style byte
 // of the form mod<<6 | r<<3 | rm, followed where needed by a SIB byte
 // (via asmidx) and an 8-bit or 32-bit displacement.
 //
 // r is placed in the middle (reg) field of the first byte. rex supplies
 // extra REX bits; only its 0x40 and Rxr bits are kept and they are ORed
 // into ab.rexflag together with the bits required by the base and index
 // registers of a. m64 is not referenced in this function body.
 //
 // If the operand needs a relocation (as reported by vaddr, or for the
 // non-shared TLS index case), the relocation is added to cursym at the
 // offset where the 32-bit displacement is written.
 //
 // Invalid operands are reported with an "asmand: bad address" diagnostic.
 func (ab *AsmBuf) asmandsz(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, r int, rex int, m64 int) {
 	var base int
 	var rel obj.Reloc

 	// Keep only the 0x40 and Rxr bits of the caller-supplied rex.
 	rex &= 0x40 | Rxr
 	if a.Offset != int64(int32(a.Offset)) {
 		// The rules are slightly different for 386 and AMD64,
 		// mostly for historical reasons. We may unify them later,
 		// but it must be discussed beforehand.
 		//
 		// For 64bit mode only LEAL is allowed to overflow.
 		// It's how https://golang.org/cl/59630 made it.
 		// crypto/sha1/sha1block_amd64.s depends on this feature.
 		//
 		// For 32bit mode rules are more permissive.
 		// If offset fits uint32, it's permitted.
 		// This is allowed for assembly that wants to use 32-bit hex
 		// constants, e.g. LEAL 0x99999999(AX), AX.
 		overflowOK := (ctxt.Arch.Family == sys.AMD64 && p.As == ALEAL) ||
 			(ctxt.Arch.Family != sys.AMD64 &&
 				int64(uint32(a.Offset)) == a.Offset &&
 				ab.rexflag&Rxw == 0)
 		if !overflowOK {
 			ctxt.Diag("offset too large in %s", p)
 		}
 	}
 	// v is the displacement; it is truncated to 32 bits even when the
 	// diagnostic above has fired.
 	v := int32(a.Offset)
 	rel.Siz = 0

 	switch a.Type {
 	case obj.TYPE_ADDR:
 		// TYPE_ADDR operands are never encodable here; the two checks
 		// only add a more specific diagnostic before the generic one.
 		if a.Name == obj.NAME_NONE {
 			ctxt.Diag("unexpected TYPE_ADDR with NAME_NONE")
 		}
 		if a.Index == REG_TLS {
 			ctxt.Diag("unexpected TYPE_ADDR with index==REG_TLS")
 		}
 		goto bad

 	case obj.TYPE_REG:
 		// Register-direct operand: mod=3, rm=reg[a.Reg], no displacement.
 		const regFirst = REG_AL
 		const regLast = REG_Z31
 		if a.Reg < regFirst || regLast < a.Reg {
 			goto bad
 		}
 		if v != 0 {
 			goto bad
 		}
 		ab.Put1(byte(3<<6 | reg[a.Reg]<<0 | r<<3))
 		ab.rexflag |= regrex[a.Reg]&(0x40|Rxb) | rex
 		return
 	}

 	if a.Type != obj.TYPE_MEM {
 		goto bad
 	}

 	// Memory operand with a real index register: rm=4 announces a SIB
 	// byte, which asmidx emits.
 	if a.Index != REG_NONE && a.Index != REG_TLS {
 		// NOTE: this declaration shadows the function-level base
 		// for the rest of this block.
 		base := int(a.Reg)
 		switch a.Name {
 		case obj.NAME_EXTERN,
 			obj.NAME_GOTREF,
 			obj.NAME_STATIC:
 			if !useAbs(ctxt, a.Sym) && ctxt.Arch.Family == sys.AMD64 {
 				goto bad
 			}
 			if ctxt.Arch.Family == sys.I386 && ctxt.Flag_shared {
 				// The base register has already been set. It holds the PC
 				// of this instruction returned by a PC-reading thunk.
 				// See obj6.go:rewriteToPcrel.
 			} else {
 				base = REG_NONE
 			}
 			v = int32(vaddr(ctxt, p, a, &rel))

 		case obj.NAME_AUTO,
 			obj.NAME_PARAM:
 			base = REG_SP
 		}

 		ab.rexflag |= regrex[int(a.Index)]&Rxx | regrex[base]&Rxb | rex
 		if base == REG_NONE {
 			// No base register: mod=0 and a 32-bit displacement
 			// follows the SIB byte.
 			ab.Put1(byte(0<<6 | 4<<0 | r<<3))
 			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
 			goto putrelv
 		}

 		// mod=0: no displacement. BP and R13 are excluded, presumably
 		// because their base encoding under mod=0 is the one asmidx
 		// uses for "no base" — TODO confirm against the reg table.
 		if v == 0 && rel.Siz == 0 && base != REG_BP && base != REG_R13 {
 			ab.Put1(byte(0<<6 | 4<<0 | r<<3))
 			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
 			return
 		}

 		// mod=1: single-byte displacement, only when no relocation is needed.
 		if disp8, ok := toDisp8(v, p, ab); ok && rel.Siz == 0 {
 			ab.Put1(byte(1<<6 | 4<<0 | r<<3))
 			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
 			ab.Put1(disp8)
 			return
 		}

 		// mod=2: 32-bit displacement.
 		ab.Put1(byte(2<<6 | 4<<0 | r<<3))
 		ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
 		goto putrelv
 	}

 	// Memory operand without an index register (or with REG_TLS as index).
 	base = int(a.Reg)
 	switch a.Name {
 	case obj.NAME_STATIC,
 		obj.NAME_GOTREF,
 		obj.NAME_EXTERN:
 		if a.Sym == nil {
 			ctxt.Diag("bad addr: %v", p)
 		}
 		if ctxt.Arch.Family == sys.I386 && ctxt.Flag_shared {
 			// The base register has already been set. It holds the PC
 			// of this instruction returned by a PC-reading thunk.
 			// See obj6.go:rewriteToPcrel.
 		} else {
 			base = REG_NONE
 		}
 		v = int32(vaddr(ctxt, p, a, &rel))

 	case obj.NAME_AUTO,
 		obj.NAME_PARAM:
 		base = REG_SP
 	}

 	if base == REG_TLS {
 		v = int32(vaddr(ctxt, p, a, &rel))
 	}

 	ab.rexflag |= regrex[base]&Rxb | rex
 	// No usable base register (none, a segment register, or TLS):
 	// the address is a bare 32-bit displacement.
 	if base == REG_NONE || (REG_CS <= base && base <= REG_GS) || base == REG_TLS {
 		if (a.Sym == nil || !useAbs(ctxt, a.Sym)) && base == REG_NONE && (a.Name == obj.NAME_STATIC || a.Name == obj.NAME_EXTERN || a.Name == obj.NAME_GOTREF) || ctxt.Arch.Family != sys.AMD64 {
 			if a.Name == obj.NAME_GOTREF && (a.Offset != 0 || a.Index != 0 || a.Scale != 0) {
 				ctxt.Diag("%v has offset against gotref", p)
 			}
 			// mod=0, rm=5 followed by a 32-bit displacement.
 			ab.Put1(byte(0<<6 | 5<<0 | r<<3))
 			goto putrelv
 		}

 		// temporary
 		ab.Put2(
 			byte(0<<6|4<<0|r<<3), // sib present
 			0<<6|4<<3|5<<0,       // DS:d32
 		)
 		goto putrelv
 	}

 	// SP and R12 as base always go through asmidx for a SIB byte with no
 	// index register. Unlike the other paths, the first byte here
 	// carries reg[base] in the rm field.
 	if base == REG_SP || base == REG_R12 {
 		if v == 0 {
 			ab.Put1(byte(0<<6 | reg[base]<<0 | r<<3))
 			ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
 			return
 		}

 		if disp8, ok := toDisp8(v, p, ab); ok {
 			ab.Put1(byte(1<<6 | reg[base]<<0 | r<<3))
 			ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
 			ab.Put1(disp8)
 			return
 		}

 		ab.Put1(byte(2<<6 | reg[base]<<0 | r<<3))
 		ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
 		goto putrelv
 	}

 	// Plain general-purpose base register, no SIB byte.
 	if REG_AX <= base && base <= REG_R15 {
 		if a.Index == REG_TLS && !ctxt.Flag_shared && !isAndroid {
 			// Move the displacement into a TLS relocation addend;
 			// the emitted displacement itself becomes 0.
 			rel = obj.Reloc{}
 			rel.Type = objabi.R_TLS_LE
 			rel.Siz = 4
 			rel.Sym = nil
 			rel.Add = int64(v)
 			v = 0
 		}

 		// mod=0: no displacement (not available for BP/R13).
 		if v == 0 && rel.Siz == 0 && base != REG_BP && base != REG_R13 {
 			ab.Put1(byte(0<<6 | reg[base]<<0 | r<<3))
 			return
 		}

 		// mod=1: 8-bit displacement.
 		if disp8, ok := toDisp8(v, p, ab); ok && rel.Siz == 0 {
 			ab.Put2(byte(1<<6|reg[base]<<0|r<<3), disp8)
 			return
 		}

 		// mod=2: 32-bit displacement.
 		ab.Put1(byte(2<<6 | reg[base]<<0 | r<<3))
 		goto putrelv
 	}

 	goto bad

 putrelv:
 	// Record the relocation (if any) at the position of the 32-bit
 	// displacement, then emit the displacement itself.
 	if rel.Siz != 0 {
 		if rel.Siz != 4 {
 			ctxt.Diag("bad rel")
 			goto bad
 		}

 		r := obj.Addrel(cursym)
 		*r = rel
 		r.Off = int32(p.Pc + int64(ab.Len()))
 	}

 	ab.PutInt32(v)
 	return

 bad:
 	ctxt.Diag("asmand: bad address %v", obj.Dconv(p, a))
 }

 // asmand encodes operand a via asmandsz, taking the reg field and the
 // REX bits from register operand ra.
 func (ab *AsmBuf) asmand(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, ra *obj.Addr) {
 	regField := reg[ra.Reg]
 	rexBits := regrex[ra.Reg]
 	ab.asmandsz(ctxt, cursym, p, a, regField, rexBits, 0)
 }

 // asmando encodes operand a via asmandsz, placing the literal value o in
 // the reg field and contributing no REX bits of its own.
 func (ab *AsmBuf) asmando(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, o int) {
 	const noRex, noM64 = 0, 0
 	ab.asmandsz(ctxt, cursym, p, a, o, noRex, noM64)
 }

 // bytereg rewrites a register operand in the range REG_AX..REG_R15 to
 // the register at the same distance from REG_AL, and zeroes *t.
 // Operands that are not plain registers in that range are left untouched.
 func bytereg(a *obj.Addr, t *uint8) {
 	if a.Type != obj.TYPE_REG || a.Index != REG_NONE {
 		return
 	}
 	if a.Reg < REG_AX || a.Reg > REG_R15 {
 		return
 	}
 	a.Reg += REG_AL - REG_AX
 	*t = 0
 }

 // unbytereg is the inverse of bytereg: a register operand in the range
 // REG_AL..REG_R15B is rewritten to the register at the same distance
 // from REG_AX, and *t is zeroed. Other operands are left untouched.
 func unbytereg(a *obj.Addr, t *uint8) {
 	if a.Type != obj.TYPE_REG || a.Index != REG_NONE {
 		return
 	}
 	if a.Reg < REG_AL || a.Reg > REG_R15B {
 		return
 	}
 	a.Reg += REG_AX - REG_AL
 	*t = 0
 }

 // Encoding kinds for ymovtab entries. Each ymovtab entry names one of
 // these values in its fifth field, ahead of its opcode bytes.
 const (
 	movLit uint8 = iota // Like Zlit
 	movRegMem
 	movMemReg
 	movRegMem2op
 	movMemReg2op
 	movFullPtr // Load full pointer, trash heap (unsupported)
 	movDoubleShift
 	movTLSReg
 )

 // ymovtab lists special-case instruction forms.
 //
 // Each entry holds, in order: the instruction, three operand classes
 // (Ynone where an operand is absent), one of the mov* encoding kinds
 // (movLit ... movTLSReg), and up to four opcode bytes.
 // The table is terminated by an all-zero entry.
 var ymovtab = []movtab{
 	// push
 	{APUSHL, Ycs, Ynone, Ynone, movLit, [4]uint8{0x0e, 0}},
 	{APUSHL, Yss, Ynone, Ynone, movLit, [4]uint8{0x16, 0}},
 	{APUSHL, Yds, Ynone, Ynone, movLit, [4]uint8{0x1e, 0}},
 	{APUSHL, Yes, Ynone, Ynone, movLit, [4]uint8{0x06, 0}},
 	{APUSHL, Yfs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa0, 0}},
 	{APUSHL, Ygs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa8, 0}},
 	{APUSHQ, Yfs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa0, 0}},
 	{APUSHQ, Ygs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa8, 0}},
 	{APUSHW, Ycs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0e, 0}},
 	{APUSHW, Yss, Ynone, Ynone, movLit, [4]uint8{Pe, 0x16, 0}},
 	{APUSHW, Yds, Ynone, Ynone, movLit, [4]uint8{Pe, 0x1e, 0}},
 	{APUSHW, Yes, Ynone, Ynone, movLit, [4]uint8{Pe, 0x06, 0}},
 	{APUSHW, Yfs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0f, 0xa0, 0}},
 	{APUSHW, Ygs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0f, 0xa8, 0}},

 	// pop
 	{APOPL, Ynone, Ynone, Yds, movLit, [4]uint8{0x1f, 0}},
 	{APOPL, Ynone, Ynone, Yes, movLit, [4]uint8{0x07, 0}},
 	{APOPL, Ynone, Ynone, Yss, movLit, [4]uint8{0x17, 0}},
 	{APOPL, Ynone, Ynone, Yfs, movLit, [4]uint8{0x0f, 0xa1, 0}},
 	{APOPL, Ynone, Ynone, Ygs, movLit, [4]uint8{0x0f, 0xa9, 0}},
 	{APOPQ, Ynone, Ynone, Yfs, movLit, [4]uint8{0x0f, 0xa1, 0}},
 	{APOPQ, Ynone, Ynone, Ygs, movLit, [4]uint8{0x0f, 0xa9, 0}},
 	{APOPW, Ynone, Ynone, Yds, movLit, [4]uint8{Pe, 0x1f, 0}},
 	{APOPW, Ynone, Ynone, Yes, movLit, [4]uint8{Pe, 0x07, 0}},
 	{APOPW, Ynone, Ynone, Yss, movLit, [4]uint8{Pe, 0x17, 0}},
 	{APOPW, Ynone, Ynone, Yfs, movLit, [4]uint8{Pe, 0x0f, 0xa1, 0}},
 	{APOPW, Ynone, Ynone, Ygs, movLit, [4]uint8{Pe, 0x0f, 0xa9, 0}},

 	// mov seg
 	{AMOVW, Yes, Ynone, Yml, movRegMem, [4]uint8{0x8c, 0, 0, 0}},
 	{AMOVW, Ycs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 1, 0, 0}},
 	{AMOVW, Yss, Ynone, Yml, movRegMem, [4]uint8{0x8c, 2, 0, 0}},
 	{AMOVW, Yds, Ynone, Yml, movRegMem, [4]uint8{0x8c, 3, 0, 0}},
 	{AMOVW, Yfs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 4, 0, 0}},
 	{AMOVW, Ygs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 5, 0, 0}},
 	{AMOVW, Yml, Ynone, Yes, movMemReg, [4]uint8{0x8e, 0, 0, 0}},
 	{AMOVW, Yml, Ynone, Ycs, movMemReg, [4]uint8{0x8e, 1, 0, 0}},
 	{AMOVW, Yml, Ynone, Yss, movMemReg, [4]uint8{0x8e, 2, 0, 0}},
 	{AMOVW, Yml, Ynone, Yds, movMemReg, [4]uint8{0x8e, 3, 0, 0}},
 	{AMOVW, Yml, Ynone, Yfs, movMemReg, [4]uint8{0x8e, 4, 0, 0}},
 	{AMOVW, Yml, Ynone, Ygs, movMemReg, [4]uint8{0x8e, 5, 0, 0}},

 	// mov cr
 	{AMOVL, Ycr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 0, 0}},
 	{AMOVL, Ycr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 2, 0}},
 	{AMOVL, Ycr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 3, 0}},
 	{AMOVL, Ycr4, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 4, 0}},
 	{AMOVL, Ycr8, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 8, 0}},
 	{AMOVQ, Ycr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 0, 0}},
 	{AMOVQ, Ycr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 2, 0}},
 	{AMOVQ, Ycr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 3, 0}},
 	{AMOVQ, Ycr4, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 4, 0}},
 	{AMOVQ, Ycr8, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 8, 0}},
 	{AMOVL, Yrl, Ynone, Ycr0, movMemReg2op, [4]uint8{0x0f, 0x22, 0, 0}},
 	{AMOVL, Yrl, Ynone, Ycr2, movMemReg2op, [4]uint8{0x0f, 0x22, 2, 0}},
 	{AMOVL, Yrl, Ynone, Ycr3, movMemReg2op, [4]uint8{0x0f, 0x22, 3, 0}},
 	{AMOVL, Yrl, Ynone, Ycr4, movMemReg2op, [4]uint8{0x0f, 0x22, 4, 0}},
 	{AMOVL, Yrl, Ynone, Ycr8, movMemReg2op, [4]uint8{0x0f, 0x22, 8, 0}},
 	{AMOVQ, Yrl, Ynone, Ycr0, movMemReg2op, [4]uint8{0x0f, 0x22, 0, 0}},
 	{AMOVQ, Yrl, Ynone, Ycr2, movMemReg2op, [4]uint8{0x0f, 0x22, 2, 0}},
 	{AMOVQ, Yrl, Ynone, Ycr3, movMemReg2op, [4]uint8{0x0f, 0x22, 3, 0}},
 	{AMOVQ, Yrl, Ynone, Ycr4, movMemReg2op, [4]uint8{0x0f, 0x22, 4, 0}},
 	{AMOVQ, Yrl, Ynone, Ycr8, movMemReg2op, [4]uint8{0x0f, 0x22, 8, 0}},

 	// mov dr
 	{AMOVL, Ydr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 0, 0}},
 	{AMOVL, Ydr6, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 6, 0}},
 	{AMOVL, Ydr7, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 7, 0}},
 	{AMOVQ, Ydr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 0, 0}},
 	{AMOVQ, Ydr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 2, 0}},
 	{AMOVQ, Ydr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 3, 0}},
 	{AMOVQ, Ydr6, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 6, 0}},
 	{AMOVQ, Ydr7, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 7, 0}},
 	{AMOVL, Yrl, Ynone, Ydr0, movMemReg2op, [4]uint8{0x0f, 0x23, 0, 0}},
 	{AMOVL, Yrl, Ynone, Ydr6, movMemReg2op, [4]uint8{0x0f, 0x23, 6, 0}},
 	{AMOVL, Yrl, Ynone, Ydr7, movMemReg2op, [4]uint8{0x0f, 0x23, 7, 0}},
 	{AMOVQ, Yrl, Ynone, Ydr0, movMemReg2op, [4]uint8{0x0f, 0x23, 0, 0}},
 	{AMOVQ, Yrl, Ynone, Ydr2, movMemReg2op, [4]uint8{0x0f, 0x23, 2, 0}},
 	{AMOVQ, Yrl, Ynone, Ydr3, movMemReg2op, [4]uint8{0x0f, 0x23, 3, 0}},
 	{AMOVQ, Yrl, Ynone, Ydr6, movMemReg2op, [4]uint8{0x0f, 0x23, 6, 0}},
 	{AMOVQ, Yrl, Ynone, Ydr7, movMemReg2op, [4]uint8{0x0f, 0x23, 7, 0}},

 	// mov tr
 	{AMOVL, Ytr6, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x24, 6, 0}},
 	{AMOVL, Ytr7, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x24, 7, 0}},
 	{AMOVL, Yml, Ynone, Ytr6, movMemReg2op, [4]uint8{0x0f, 0x26, 6, 0xff}},
 	{AMOVL, Yml, Ynone, Ytr7, movMemReg2op, [4]uint8{0x0f, 0x26, 7, 0xff}},

 	// lgdt, sgdt, lidt, sidt
 	{AMOVL, Ym, Ynone, Ygdtr, movMemReg2op, [4]uint8{0x0f, 0x01, 2, 0}},
 	{AMOVL, Ygdtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 0, 0}},
 	{AMOVL, Ym, Ynone, Yidtr, movMemReg2op, [4]uint8{0x0f, 0x01, 3, 0}},
 	{AMOVL, Yidtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 1, 0}},
 	{AMOVQ, Ym, Ynone, Ygdtr, movMemReg2op, [4]uint8{0x0f, 0x01, 2, 0}},
 	{AMOVQ, Ygdtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 0, 0}},
 	{AMOVQ, Ym, Ynone, Yidtr, movMemReg2op, [4]uint8{0x0f, 0x01, 3, 0}},
 	{AMOVQ, Yidtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 1, 0}},

 	// lldt, sldt
 	{AMOVW, Yml, Ynone, Yldtr, movMemReg2op, [4]uint8{0x0f, 0x00, 2, 0}},
 	{AMOVW, Yldtr, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x00, 0, 0}},

 	// lmsw, smsw
 	{AMOVW, Yml, Ynone, Ymsw, movMemReg2op, [4]uint8{0x0f, 0x01, 6, 0}},
 	{AMOVW, Ymsw, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x01, 4, 0}},

 	// ltr, str
 	{AMOVW, Yml, Ynone, Ytask, movMemReg2op, [4]uint8{0x0f, 0x00, 3, 0}},
 	{AMOVW, Ytask, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x00, 1, 0}},

 	/* load full pointer - unsupported
 	{AMOVL, Yml, Ycol, movFullPtr, [4]uint8{0, 0, 0, 0}},
 	{AMOVW, Yml, Ycol, movFullPtr, [4]uint8{Pe, 0, 0, 0}},
 	*/

 	// double shift
 	{ASHLL, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
 	{ASHLL, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
 	{ASHLL, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
 	{ASHRL, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
 	{ASHRL, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
 	{ASHRL, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
 	{ASHLQ, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
 	{ASHLQ, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
 	{ASHLQ, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
 	{ASHRQ, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
 	{ASHRQ, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
 	{ASHRQ, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
 	{ASHLW, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
 	{ASHLW, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
 	{ASHLW, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
 	{ASHRW, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
 	{ASHRW, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
 	{ASHRW, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},

 	// load TLS base
 	{AMOVL, Ytls, Ynone, Yrl, movTLSReg, [4]uint8{0, 0, 0, 0}},
 	{AMOVQ, Ytls, Ynone, Yrl, movTLSReg, [4]uint8{0, 0, 0, 0}},
 	{0, 0, 0, 0, 0, [4]uint8{}},
 }

 // isax reports whether operand a refers to the AX register: either its
 // register is REG_AX, REG_AL or REG_AH, or its index register is REG_AX.
 func isax(a *obj.Addr) bool {
 	if a.Reg == REG_AX || a.Reg == REG_AL || a.Reg == REG_AH {
 		return true
 	}
 	return a.Index == REG_AX
 }

 // subreg replaces every use of register from with register to in the
 // From and To operands of p, covering both the Reg and Index fields.
 // Whenever an operand is modified its cached class (p.Ft or p.Tt) is
 // reset to 0.
 func subreg(p *obj.Prog, from int, to int) {
 	if false { /* debug['Q'] */
 		fmt.Printf("\n%v\ts/%v/%v/\n", p, rconv(from), rconv(to))
 	}

 	// substitute rewrites one operand and invalidates its class on change.
 	substitute := func(operand *obj.Addr, class *uint8) {
 		if int(operand.Reg) == from {
 			operand.Reg = int16(to)
 			*class = 0
 		}
 		if int(operand.Index) == from {
 			operand.Index = int16(to)
 			*class = 0
 		}
 	}
 	substitute(&p.From, &p.Ft)
 	substitute(&p.To, &p.Tt)

 	if false { /* debug['Q'] */
 		fmt.Printf("%v\n", p)
 	}
 }

 // mediaop emits the escape bytes and the opcode byte for a media
 // instruction and returns the (possibly advanced) index into o.op.
 //
 // When op is one of Pm, Pe, Pf2 or Pf3 and osize is not 1, op acts as a
 // prefix: it is written (unless it is Pm itself), then Pm is written,
 // and the actual opcode is taken from the next slot of o.op. In every
 // other case Pm is written only if the buffer does not already end
 // with it. The opcode byte is always written last.
 func (ab *AsmBuf) mediaop(ctxt *obj.Link, o *Optab, op int, osize int, z int) int {
 	isPrefix := op == Pm || op == Pe || op == Pf2 || op == Pf3

 	if isPrefix && osize != 1 {
 		if op != Pm {
 			ab.Put1(byte(op))
 		}
 		ab.Put1(Pm)
 		z++
 		op = int(o.op[z])
 	} else if ab.Len() == 0 || ab.Last() != Pm {
 		ab.Put1(Pm)
 	}

 	ab.Put1(byte(op))
 	return z
 }

 // bpduff1 holds the raw machine-code bytes of the two instructions named
 // in the per-line comments: BP is stored at -16(SP) and then BP is set
 // to the address of that slot.
 // NOTE(review): presumably emitted before a Duff's-device call to set up
 // a frame pointer — confirm at the use site.
 var bpduff1 = []byte{
 	0x48, 0x89, 0x6c, 0x24, 0xf0, // MOVQ BP, -16(SP)
 	0x48, 0x8d, 0x6c, 0x24, 0xf0, // LEAQ -16(SP), BP
 }

 // bpduff2 holds the raw machine-code bytes of the instruction named in
 // the per-line comment: BP is reloaded from the memory it points to.
 // NOTE(review): presumably the counterpart of bpduff1, restoring BP
 // after the call — confirm at the use site.
 var bpduff2 = []byte{
 	0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
 }

 // asmevex emits EVEX prefix and opcode byte.
 // In addition to asmvex r/m, vvvv and reg fields also requires optional
 // K-masking register.
 //
 // The operands map onto the prefix as follows (any of them may be nil):
 //
 //	rm - register or memory operand; supplies the X and B bits, and V'
 //	     when its index register needs the EVEX extension bit
 //	v  - supplies the vvvv field and, otherwise, V'
 //	r  - supplies the R and R' bits
 //	k  - opmask register; supplies the aaa field
 //
 // The AVX-512 suffix recorded in p.Scond (zeroing, rounding, broadcast,
 // SAE) is validated against what the instruction's EVEX bits allow, and
 // a diagnostic is reported through ctxt for unsupported combinations.
 //
 // Five bytes are written: the 0x62 escape, P0, P1, P2 and the opcode.
 //
 // Expects asmbuf.evex to be properly initialized.
 func (ab *AsmBuf) asmevex(ctxt *obj.Link, p *obj.Prog, rm, v, r, k *obj.Addr) {
 	ab.evexflag = true
 	evex := ab.evex

 	// These bits start at 1 and are cleared to 0 when the corresponding
 	// register has the matching extension bit set in regrex.
 	rexR := byte(1)
 	evexR := byte(1)
 	rexX := byte(1)
 	rexB := byte(1)
 	if r != nil {
 		if regrex[r.Reg]&Rxr != 0 {
 			rexR = 0 // "ModR/M.reg" selector 4th bit.
 		}
 		if regrex[r.Reg]&RxrEvex != 0 {
 			evexR = 0 // "ModR/M.reg" selector 5th bit.
 		}
 	}
 	if rm != nil {
 		// Without an index register, X is driven by the EVEX extension
 		// bit of rm.Reg; otherwise by the index register's Rxx bit.
 		if rm.Index == REG_NONE && regrex[rm.Reg]&RxrEvex != 0 {
 			rexX = 0
 		} else if regrex[rm.Index]&Rxx != 0 {
 			rexX = 0
 		}
 		if regrex[rm.Reg]&Rxb != 0 {
 			rexB = 0
 		}
 	}
 	// P0 = [R][X][B][R'][00][mm]
 	p0 := (rexR << 7) |
 		(rexX << 6) |
 		(rexB << 5) |
 		(evexR << 4) |
 		(0 << 2) |
 		(evex.M() << 0)

 	vexV := byte(0)
 	if v != nil {
 		// 4bit-wide reg index.
 		vexV = byte(reg[v.Reg]|(regrex[v.Reg]&Rxr)<<1) & 0xF
 	}
 	// The vvvv field is written with all four bits flipped.
 	vexV ^= 0x0F
 	// P1 = [W][vvvv][1][pp]
 	p1 := (evex.W() << 7) |
 		(vexV << 3) |
 		(1 << 2) |
 		(evex.P() << 0)

 	suffix := evexSuffixMap[p.Scond]
 	evexZ := byte(0)
 	evexLL := evex.L()
 	evexB := byte(0)
 	evexV := byte(1)
 	evexA := byte(0)
 	if suffix.zeroing {
 		if !evex.ZeroingEnabled() {
 			ctxt.Diag("unsupported zeroing: %v", p)
 		}
 		evexZ = 1
 	}
 	// Rounding, broadcast and SAE all set the b bit; the cases are
 	// tried in this order and only the first match applies.
 	switch {
 	case suffix.rounding != rcUnset:
 		if rm != nil && rm.Type == obj.TYPE_MEM {
 			ctxt.Diag("illegal rounding with memory argument: %v", p)
 		} else if !evex.RoundingEnabled() {
 			ctxt.Diag("unsupported rounding: %v", p)
 		}
 		evexB = 1
 		// With rounding, the L'L field carries the rounding mode
 		// instead of the vector length.
 		evexLL = suffix.rounding
 	case suffix.broadcast:
 		if rm == nil || rm.Type != obj.TYPE_MEM {
 			ctxt.Diag("illegal broadcast without memory argument: %v", p)
 		} else if !evex.BroadcastEnabled() {
 			ctxt.Diag("unsupported broadcast: %v", p)
 		}
 		evexB = 1
 	case suffix.sae:
 		if rm != nil && rm.Type == obj.TYPE_MEM {
 			ctxt.Diag("illegal SAE with memory argument: %v", p)
 		} else if !evex.SaeEnabled() {
 			ctxt.Diag("unsupported SAE: %v", p)
 		}
 		evexB = 1
 	}
 	// V' is cleared for an extended index register of rm, or failing
 	// that, for an extended v register.
 	if rm != nil && regrex[rm.Index]&RxrEvex != 0 {
 		evexV = 0
 	} else if v != nil && regrex[v.Reg]&RxrEvex != 0 {
 		evexV = 0 // VSR selector 5th bit.
 	}
 	if k != nil {
 		evexA = byte(reg[k.Reg])
 	}
 	// P2 = [z][L'L][b][V'][aaa]
 	p2 := (evexZ << 7) |
 		(evexLL << 5) |
 		(evexB << 4) |
 		(evexV << 3) |
 		(evexA << 0)

 	const evexEscapeByte = 0x62
 	ab.Put4(evexEscapeByte, p0, p1, p2)
 	ab.Put1(evex.opcode)
 }

 // asmvex emits a VEX prefix followed by the opcode byte.
 //
 // rm, v and r are the operands that feed the r/m, vvvv and reg fields
 // respectively; any of them may be nil. rm and r are given in the same
 // order as the arguments of the asmand call that typically follows.
 // vex is the packed VEX prefix description (see encoding above) and
 // opcode is the opcode byte.
 //
 // The two-byte form (0xc5) is chosen when the opcode map field is 1,
 // neither the X nor the B extension bit is needed, and W is not set;
 // otherwise the three-byte form (0xc4) is used.
 // For details about vex prefix see:
 // https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
 func (ab *AsmBuf) asmvex(ctxt *obj.Link, rm, v, r *obj.Addr, vex, opcode uint8) {
 	ab.vexflag = true

 	// Collect the register extension bits required by the operands.
 	var rexR, rexX, rexB int
 	if r != nil {
 		rexR = regrex[r.Reg] & Rxr
 	}
 	if rm != nil {
 		rexB = regrex[rm.Reg] & Rxb
 		rexX = regrex[rm.Index] & Rxx
 	}

 	// vvvv is the 4-bit register number of v with all bits flipped.
 	var vvvv byte
 	if v != nil {
 		vvvv = byte(reg[v.Reg]|(regrex[v.Reg]&Rxr)<<1) & 0xF
 	}
 	vvvv ^= 0xF

 	mapSelect := (vex >> 3) & 0x7
 	lastByte := vvvv<<3 | vex&0x87

 	twoByte := mapSelect == 1 && (rexX|rexB) == 0 && vex&vexW1 == 0
 	if twoByte {
 		// Can use 2-byte encoding.
 		ab.Put2(0xc5, byte(rexR<<5)^0x80|lastByte)
 	} else {
 		// Must use 3-byte encoding.
 		ab.Put3(0xc4,
 			(byte(rexR|rexX|rexB)<<5)^0xE0|mapSelect,
 			lastByte,
 		)
 	}
 	ab.Put1(opcode)
 }

 // regIndex returns register index that fits in 5 bits.
 //
 //	R         : 3 bit | legacy instructions     | N/A
 //	[R/V]EX.R : 1 bit | REX / VEX extension bit | Rxr
 //	EVEX.R    : 1 bit | EVEX extension bit      | RxrEvex
 //
 // Examples:
 //	REG_Z30 => 30
 //	REG_X15 => 15
 //	REG_R9  => 9
 //	REG_AX  => 0
 //
 func regIndex(r int16) int {
 	lower3bits := reg[r]
 	high4bit := regrex[r] & Rxr << 1
 	high5bit := regrex[r] & RxrEvex << 0
 	return lower3bits | high4bit | high5bit
 }

 // avx2gatherValid reports whether p satisfies AVX2 gather constraints.
 // A violation is reported through ctxt before false is returned.
 func avx2gatherValid(ctxt *obj.Link, p *obj.Prog) bool {
 	idx := regIndex(p.GetFrom3().Index)
 	msk := regIndex(p.From.Reg)
 	dst := regIndex(p.To.Reg)

 	// The index, mask and destination registers must be pairwise
 	// different; otherwise an illegal instruction trap (#UD) is triggered.
 	if dst != msk && dst != idx && msk != idx {
 		return true
 	}

 	ctxt.Diag("mask, index, and destination registers should be distinct: %v", p)
 	return false
 }

 // avx512gatherValid reports whether p satisfies AVX512 gather constraints.
 // A violation is reported through ctxt before false is returned.
 func avx512gatherValid(ctxt *obj.Link, p *obj.Prog) bool {
 	idx := regIndex(p.From.Index)
 	dst := regIndex(p.To.Reg)

 	// The destination vector register must differ from the VSIB index
 	// vector; otherwise an illegal instruction trap (#UD) is triggered.
 	if dst != idx {
 		return true
 	}

 	ctxt.Diag("index and destination registers should be distinct: %v", p)
 	return false
 }

 func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
 	o := opindex[p.As&obj.AMask]

 	if o == nil {
 		ctxt.Diag("asmins: missing op %v", p)
 		return
 	}

 	if pre := prefixof(ctxt, &p.From); pre != 0 {
 		ab.Put1(byte(pre))
 	}
 	if pre := prefixof(ctxt, &p.To); pre != 0 {
 		ab.Put1(byte(pre))
 	}

 	// Checks to warn about instruction/arguments combinations that
 	// will unconditionally trigger illegal instruction trap (#UD).
 	switch p.As {
 	case AVGATHERDPD,
 		AVGATHERQPD,
 		AVGATHERDPS,
 		AVGATHERQPS,
 		AVPGATHERDD,
 		AVPGATHERQD,
 		AVPGATHERDQ,
 		AVPGATHERQQ:
 		// AVX512 gather requires explicit K mask.
 		if p.GetFrom3().Reg >= REG_K0 && p.GetFrom3().Reg <= REG_K7 {
 			if !avx512gatherValid(ctxt, p) {
 				return
 			}
 		} else {
 			if !avx2gatherValid(ctxt, p) {
 				return
 			}
 		}
 	}

 	if p.Ft == 0 {
 		p.Ft = uint8(oclass(ctxt, p, &p.From))
 	}
 	if p.Tt == 0 {
 		p.Tt = uint8(oclass(ctxt, p, &p.To))
 	}

 	ft := int(p.Ft) * Ymax
 	var f3t int
 	tt := int(p.Tt) * Ymax

 	xo := obj.Bool2int(o.op[0] == 0x0f)
 	z := 0
 	var a *obj.Addr
 	var l int
 	var op int
 	var q *obj.Prog
 	var r *obj.Reloc
 	var rel obj.Reloc
 	var v int64

 	args := make([]int, 0, argListMax)
 	if ft != Ynone*Ymax {
 		args = append(args, ft)
 	}
 	for i := range p.RestArgs {
 		args = append(args, oclass(ctxt, p, &p.RestArgs[i].Addr)*Ymax)
 	}
 	if tt != Ynone*Ymax {
 		args = append(args, tt)
 	}

 	for _, yt := range o.ytab {
 		// ytab matching is purely args-based,
 		// but AVX512 suffixes like "Z" or "RU_SAE" will
 		// add EVEX-only filter that will reject non-EVEX matches.
 		//
 		// Consider "VADDPD.BCST 2032(DX), X0, X0".
 		// Without this rule, operands will lead to VEX-encoded form
 		// and produce "c5b15813" encoding.
 		if !yt.match(args) {
 			// "xo" is always zero for VEX/EVEX encoded insts.
 			z += int(yt.zoffset) + xo
 		} else {
 			if p.Scond != 0 && !evexZcase(yt.zcase) {
 				// Do not signal error and continue to search
 				// for matching EVEX-encoded form.
 				z += int(yt.zoffset)
 				continue
 			}

 			switch o.prefix {
 			case Px1: // first option valid only in 32-bit mode
 				if ctxt.Arch.Family == sys.AMD64 && z == 0 {
 					z += int(yt.zoffset) + xo
 					continue
 				}
 			case Pq: // 16 bit escape and opcode escape
 				ab.Put2(Pe, Pm)

 			case Pq3: // 16 bit escape and opcode escape + REX.W
 				ab.rexflag |= Pw
 				ab.Put2(Pe, Pm)

 			case Pq4: // 66 0F 38
 				ab.Put3(0x66, 0x0F, 0x38)

 			case Pq4w: // 66 0F 38 + REX.W
 				ab.rexflag |= Pw
 				ab.Put3(0x66, 0x0F, 0x38)

 			case Pq5: // F3 0F 38
 				ab.Put3(0xF3, 0x0F, 0x38)

 			case Pq5w: //  F3 0F 38 + REX.W
 				ab.rexflag |= Pw
 				ab.Put3(0xF3, 0x0F, 0x38)

 			case Pf2, // xmm opcode escape
 				Pf3:
 				ab.Put2(o.prefix, Pm)

 			case Pef3:
 				ab.Put3(Pe, Pf3, Pm)

 			case Pfw: // xmm opcode escape + REX.W
 				ab.rexflag |= Pw
 				ab.Put2(Pf3, Pm)

 			case Pm: // opcode escape
 				ab.Put1(Pm)

 			case Pe: // 16 bit escape
 				ab.Put1(Pe)

 			case Pw: // 64-bit escape
 				if ctxt.Arch.Family != sys.AMD64 {
 					ctxt.Diag("asmins: illegal 64: %v", p)
 				}
 				ab.rexflag |= Pw

 			case Pw8: // 64-bit escape if z >= 8
 				if z >= 8 {
 					if ctxt.Arch.Family != sys.AMD64 {
 						ctxt.Diag("asmins: illegal 64: %v", p)
 					}
 					ab.rexflag |= Pw
 				}

 			case Pb: // botch
 				if ctxt.Arch.Family != sys.AMD64 && (isbadbyte(&p.From) || isbadbyte(&p.To)) {
 					goto bad
 				}
 				// NOTE(rsc): This is probably safe to do always,
 				// but when enabled it chooses different encodings
 				// than the old cmd/internal/obj/i386 code did,
 				// which breaks our "same bits out" checks.
 				// In particular, CMPB AX, $0 encodes as 80 f8 00
 				// in the original obj/i386, and it would encode
 				// (using a valid, shorter form) as 3c 00 if we enabled
 				// the call to bytereg here.
 				if ctxt.Arch.Family == sys.AMD64 {
 					bytereg(&p.From, &p.Ft)
 					bytereg(&p.To, &p.Tt)
 				}

 			case P32: // 32 bit but illegal if 64-bit mode
 				if ctxt.Arch.Family == sys.AMD64 {
 					ctxt.Diag("asmins: illegal in 64-bit mode: %v", p)
 				}

 			case Py: // 64-bit only, no prefix
 				if ctxt.Arch.Family != sys.AMD64 {
 					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
 				}

 			case Py1: // 64-bit only if z < 1, no prefix
 				if z < 1 && ctxt.Arch.Family != sys.AMD64 {
 					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
 				}

 			case Py3: // 64-bit only if z < 3, no prefix
 				if z < 3 && ctxt.Arch.Family != sys.AMD64 {
 					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
 				}
 			}

 			if z >= len(o.op) {
 				log.Fatalf("asmins bad table %v", p)
 			}
 			op = int(o.op[z])
 			if op == 0x0f {
 				ab.Put1(byte(op))
 				z++
 				op = int(o.op[z])
 			}

 			switch yt.zcase {
 			default:
 				ctxt.Diag("asmins: unknown z %d %v", yt.zcase, p)
 				return

 			case Zpseudo:
 				break

 			case Zlit:
 				ab.PutOpBytesLit(z, &o.op)

 			case Zlitr_m:
 				ab.PutOpBytesLit(z, &o.op)
 				ab.asmand(ctxt, cursym, p, &p.To, &p.From)

 			case Zlitm_r:
 				ab.PutOpBytesLit(z, &o.op)
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 			case Zlit_m_r:
 				ab.PutOpBytesLit(z, &o.op)
 				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)

 			case Zmb_r:
 				bytereg(&p.From, &p.Ft)
 				fallthrough

 			case Zm_r:
 				ab.Put1(byte(op))
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 			case Z_m_r:
 				ab.Put1(byte(op))
 				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)

 			case Zm2_r:
 				ab.Put2(byte(op), o.op[z+1])
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 			case Zm_r_xm:
 				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 			case Zm_r_xm_nr:
 				ab.rexflag = 0
 				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 			case Zm_r_i_xm:
 				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
 				ab.asmand(ctxt, cursym, p, &p.From, p.GetFrom3())
 				ab.Put1(byte(p.To.Offset))

 			case Zibm_r, Zibr_m:
 				ab.PutOpBytesLit(z, &o.op)
 				if yt.zcase == Zibr_m {
 					ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
 				} else {
 					ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
 				}
 				switch {
 				default:
 					ab.Put1(byte(p.From.Offset))
 				case yt.args[0] == Yi32 && o.prefix == Pe:
 					ab.PutInt16(int16(p.From.Offset))
 				case yt.args[0] == Yi32:
 					ab.PutInt32(int32(p.From.Offset))
 				}

 			case Zaut_r:
 				ab.Put1(0x8d) // leal
 				if p.From.Type != obj.TYPE_ADDR {
 					ctxt.Diag("asmins: Zaut sb type ADDR")
 				}
 				p.From.Type = obj.TYPE_MEM
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
 				p.From.Type = obj.TYPE_ADDR

 			case Zm_o:
 				ab.Put1(byte(op))
 				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))

 			case Zr_m:
 				ab.Put1(byte(op))
 				ab.asmand(ctxt, cursym, p, &p.To, &p.From)

 			case Zvex:
 				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])

 			case Zvex_rm_v_r:
 				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 			case Zvex_rm_v_ro:
 				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
 				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+2]))

 			case Zvex_i_rm_vo:
 				ab.asmvex(ctxt, p.GetFrom3(), &p.To, nil, o.op[z], o.op[z+1])
 				ab.asmando(ctxt, cursym, p, p.GetFrom3(), int(o.op[z+2]))
 				ab.Put1(byte(p.From.Offset))

 			case Zvex_i_r_v:
 				ab.asmvex(ctxt, p.GetFrom3(), &p.To, nil, o.op[z], o.op[z+1])
 				regnum := byte(0x7)
 				if p.GetFrom3().Reg >= REG_X0 && p.GetFrom3().Reg <= REG_X15 {
 					regnum &= byte(p.GetFrom3().Reg - REG_X0)
 				} else {
 					regnum &= byte(p.GetFrom3().Reg - REG_Y0)
 				}
 				ab.Put1(o.op[z+2] | regnum)
 				ab.Put1(byte(p.From.Offset))

 			case Zvex_i_rm_v_r:
 				imm, from, from3, to := unpackOps4(p)
 				ab.asmvex(ctxt, from, from3, to, o.op[z], o.op[z+1])
 				ab.asmand(ctxt, cursym, p, from, to)
 				ab.Put1(byte(imm.Offset))

 			case Zvex_i_rm_r:
 				ab.asmvex(ctxt, p.GetFrom3(), nil, &p.To, o.op[z], o.op[z+1])
 				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
 				ab.Put1(byte(p.From.Offset))

 			case Zvex_v_rm_r:
 				ab.asmvex(ctxt, p.GetFrom3(), &p.From, &p.To, o.op[z], o.op[z+1])
 				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)

 			case Zvex_r_v_rm:
 				ab.asmvex(ctxt, &p.To, p.GetFrom3(), &p.From, o.op[z], o.op[z+1])
 				ab.asmand(ctxt, cursym, p, &p.To, &p.From)

 			case Zvex_rm_r_vo:
 				ab.asmvex(ctxt, &p.From, &p.To, p.GetFrom3(), o.op[z], o.op[z+1])
 				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+2]))

 			case Zvex_i_r_rm:
 				ab.asmvex(ctxt, &p.To, nil, p.GetFrom3(), o.op[z], o.op[z+1])
 				ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
 				ab.Put1(byte(p.From.Offset))

 			case Zvex_hr_rm_v_r:
 				hr, from, from3, to := unpackOps4(p)
 				ab.asmvex(ctxt, from, from3, to, o.op[z], o.op[z+1])
 				ab.asmand(ctxt, cursym, p, from, to)
 				ab.Put1(byte(regIndex(hr.Reg) << 4))

 			case Zevex_k_rmo:
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, &p.To, nil, nil, &p.From)
 				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+3]))

 			case Zevex_i_rm_vo:
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, p.GetFrom3(), &p.To, nil, nil)
 				ab.asmando(ctxt, cursym, p, p.GetFrom3(), int(o.op[z+3]))
 				ab.Put1(byte(p.From.Offset))

 			case Zevex_i_rm_k_vo:
 				imm, from, kmask, to := unpackOps4(p)
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, from, to, nil, kmask)
 				ab.asmando(ctxt, cursym, p, from, int(o.op[z+3]))
 				ab.Put1(byte(imm.Offset))

 			case Zevex_i_r_rm:
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, &p.To, nil, p.GetFrom3(), nil)
 				ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
 				ab.Put1(byte(p.From.Offset))

 			case Zevex_i_r_k_rm:
 				imm, from, kmask, to := unpackOps4(p)
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, to, nil, from, kmask)
 				ab.asmand(ctxt, cursym, p, to, from)
 				ab.Put1(byte(imm.Offset))

 			case Zevex_i_rm_r:
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, p.GetFrom3(), nil, &p.To, nil)
 				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
 				ab.Put1(byte(p.From.Offset))

 			case Zevex_i_rm_k_r:
 				imm, from, kmask, to := unpackOps4(p)
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, from, nil, to, kmask)
 				ab.asmand(ctxt, cursym, p, from, to)
 				ab.Put1(byte(imm.Offset))

 			case Zevex_i_rm_v_r:
 				imm, from, from3, to := unpackOps4(p)
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, from, from3, to, nil)
 				ab.asmand(ctxt, cursym, p, from, to)
 				ab.Put1(byte(imm.Offset))

 			case Zevex_i_rm_v_k_r:
 				imm, from, from3, kmask, to := unpackOps5(p)
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, from, from3, to, kmask)
 				ab.asmand(ctxt, cursym, p, from, to)
 				ab.Put1(byte(imm.Offset))

 			case Zevex_r_v_rm:
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, &p.To, p.GetFrom3(), &p.From, nil)
 				ab.asmand(ctxt, cursym, p, &p.To, &p.From)

 			case Zevex_rm_v_r:
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, &p.From, p.GetFrom3(), &p.To, nil)
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 			case Zevex_rm_k_r:
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, &p.From, nil, &p.To, p.GetFrom3())
 				ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 			case Zevex_r_k_rm:
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, &p.To, nil, &p.From, p.GetFrom3())
 				ab.asmand(ctxt, cursym, p, &p.To, &p.From)

 			case Zevex_rm_v_k_r:
 				from, from3, kmask, to := unpackOps4(p)
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, from, from3, to, kmask)
 				ab.asmand(ctxt, cursym, p, from, to)

 			case Zevex_r_v_k_rm:
 				from, from3, kmask, to := unpackOps4(p)
 				ab.evex = newEVEXBits(z, &o.op)
 				ab.asmevex(ctxt, p, to, from3, from, kmask)
 				ab.asmand(ctxt, cursym, p, to, from)

 			case Zr_m_xm:
 				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
 				ab.asmand(ctxt, cursym, p, &p.To, &p.From)

 			case Zr_m_xm_nr:
 				ab.rexflag = 0
 				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
 				ab.asmand(ctxt, cursym, p, &p.To, &p.From)

 			case Zo_m:
 				ab.Put1(byte(op))
 				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))

 			case Zcallindreg:
 				r = obj.Addrel(cursym)
 				r.Off = int32(p.Pc)
 				r.Type = objabi.R_CALLIND
 				r.Siz = 0
 				fallthrough

 			case Zo_m64:
 				ab.Put1(byte(op))
 				ab.asmandsz(ctxt, cursym, p, &p.To, int(o.op[z+1]), 0, 1)

 			case Zm_ibo:
 				ab.Put1(byte(op))
 				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
 				ab.Put1(byte(vaddr(ctxt, p, &p.To, nil)))

 			case Zibo_m:
 				ab.Put1(byte(op))
 				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
 				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))

 			case Zibo_m_xm:
 				z = ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
 				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
 				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))

 			case Z_ib, Zib_:
 				if yt.zcase == Zib_ {
 					a = &p.From
 				} else {
 					a = &p.To
 				}
 				ab.Put1(byte(op))
 				if p.As == AXABORT {
 					ab.Put1(o.op[z+1])
 				}
 				ab.Put1(byte(vaddr(ctxt, p, a, nil)))

 			case Zib_rp:
 				ab.rexflag |= regrex[p.To.Reg] & (Rxb | 0x40)
 				ab.Put2(byte(op+reg[p.To.Reg]), byte(vaddr(ctxt, p, &p.From, nil)))

 			case Zil_rp:
 				ab.rexflag |= regrex[p.To.Reg] & Rxb
 				ab.Put1(byte(op + reg[p.To.Reg]))
 				if o.prefix == Pe {
 					v = vaddr(ctxt, p, &p.From, nil)
 					ab.PutInt16(int16(v))
 				} else {
 					ab.relput4(ctxt, cursym, p, &p.From)
 				}

 			case Zo_iw:
 				ab.Put1(byte(op))
 				if p.From.Type != obj.TYPE_NONE {
 					v = vaddr(ctxt, p, &p.From, nil)
 					ab.PutInt16(int16(v))
 				}

 			case Ziq_rp:
 				v = vaddr(ctxt, p, &p.From, &rel)
 				l = int(v >> 32)
 				if l == 0 && rel.Siz != 8 {
 					ab.rexflag &^= (0x40 | Rxw)

 					ab.rexflag |= regrex[p.To.Reg] & Rxb
 					ab.Put1(byte(0xb8 + reg[p.To.Reg]))
 					if rel.Type != 0 {
 						r = obj.Addrel(cursym)
 						*r = rel
 						r.Off = int32(p.Pc + int64(ab.Len()))
 					}

 					ab.PutInt32(int32(v))
 				} else if l == -1 && uint64(v)&(uint64(1)<<31) != 0 { // sign extend
 					ab.Put1(0xc7)
 					ab.asmando(ctxt, cursym, p, &p.To, 0)

 					ab.PutInt32(int32(v)) // need all 8
 				} else {
 					ab.rexflag |= regrex[p.To.Reg] & Rxb
 					ab.Put1(byte(op + reg[p.To.Reg]))
 					if rel.Type != 0 {
 						r = obj.Addrel(cursym)
 						*r = rel
 						r.Off = int32(p.Pc + int64(ab.Len()))
 					}

 					ab.PutInt64(v)
 				}

 			case Zib_rr:
 				ab.Put1(byte(op))
 				ab.asmand(ctxt, cursym, p, &p.To, &p.To)
 				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))

 			case Z_il, Zil_:
 				if yt.zcase == Zil_ {
 					a = &p.From
 				} else {
 					a = &p.To
 				}
 				ab.Put1(byte(op))
 				if o.prefix == Pe {
 					v = vaddr(ctxt, p, a, nil)
 					ab.PutInt16(int16(v))
 				} else {
 					ab.relput4(ctxt, cursym, p, a)
 				}

 			case Zm_ilo, Zilo_m:
 				ab.Put1(byte(op))
 				if yt.zcase == Zilo_m {
 					a = &p.From
 					ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
 				} else {
 					a = &p.To
 					ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
 				}

 				if o.prefix == Pe {
 					v = vaddr(ctxt, p, a, nil)
 					ab.PutInt16(int16(v))
 				} else {
 					ab.relput4(ctxt, cursym, p, a)
 				}

 			case Zil_rr:
 				ab.Put1(byte(op))
 				ab.asmand(ctxt, cursym, p, &p.To, &p.To)
 				if o.prefix == Pe {
 					v = vaddr(ctxt, p, &p.From, nil)
 					ab.PutInt16(int16(v))
 				} else {
 					ab.relput4(ctxt, cursym, p, &p.From)
 				}

 			case Z_rp:
 				ab.rexflag |= regrex[p.To.Reg] & (Rxb | 0x40)
 				ab.Put1(byte(op + reg[p.To.Reg]))

 			case Zrp_:
 				ab.rexflag |= regrex[p.From.Reg] & (Rxb | 0x40)
 				ab.Put1(byte(op + reg[p.From.Reg]))

 			case Zcallcon, Zjmpcon:
 				if yt.zcase == Zcallcon {
 					ab.Put1(byte(op))
 				} else {
 					ab.Put1(o.op[z+1])
 				}
 				r = obj.Addrel(cursym)
 				r.Off = int32(p.Pc + int64(ab.Len()))
 				r.Type = objabi.R_PCREL
 				r.Siz = 4
 				r.Add = p.To.Offset
 				ab.PutInt32(0)

 			case Zcallind:
 				ab.Put2(byte(op), o.op[z+1])
 				r = obj.Addrel(cursym)
 				r.Off = int32(p.Pc + int64(ab.Len()))
 				if ctxt.Arch.Family == sys.AMD64 {
 					r.Type = objabi.R_PCREL
 				} else {
 					r.Type = objabi.R_ADDR
 				}
 				r.Siz = 4
 				r.Add = p.To.Offset
 				r.Sym = p.To.Sym
 				ab.PutInt32(0)

 			case Zcall, Zcallduff:
 				if p.To.Sym == nil {
 					ctxt.Diag("call without target")
 					ctxt.DiagFlush()
 					log.Fatalf("bad code")
 				}

 				if yt.zcase == Zcallduff && ctxt.Flag_dynlink {
 					ctxt.Diag("directly calling duff when dynamically linking Go")
 				}

 				if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
 					// Maintain BP around call, since duffcopy/duffzero can't do it
 					// (the call jumps into the middle of the function).
 					// This makes it possible to see call sites for duffcopy/duffzero in
 					// BP-based profiling tools like Linux perf (which is the
 					// whole point of maintaining frame pointers in Go).
 					// MOVQ BP, -16(SP)
 					// LEAQ -16(SP), BP
 					ab.Put(bpduff1)
 				}
 				ab.Put1(byte(op))
 				r = obj.Addrel(cursym)
 				r.Off = int32(p.Pc + int64(ab.Len()))
 				r.Sym = p.To.Sym
 				r.Add = p.To.Offset
 				r.Type = objabi.R_CALL
 				r.Siz = 4
 				ab.PutInt32(0)

 				if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
 					// Pop BP pushed above.
 					// MOVQ 0(BP), BP
 					ab.Put(bpduff2)
 				}

 			// TODO: jump across functions needs reloc
 			case Zbr, Zjmp, Zloop:
 				if p.As == AXBEGIN {
 					ab.Put1(byte(op))
 				}
 				if p.To.Sym != nil {
 					if yt.zcase != Zjmp {
 						ctxt.Diag("branch to ATEXT")
 						ctxt.DiagFlush()
 						log.Fatalf("bad code")
 					}

 					ab.Put1(o.op[z+1])
 					r = obj.Addrel(cursym)
 					r.Off = int32(p.Pc + int64(ab.Len()))
 					r.Sym = p.To.Sym
 					// Note: R_CALL instead of R_PCREL. R_CALL is more permissive in that
 					// it can point to a trampoline instead of the destination itself.
 					r.Type = objabi.R_CALL
 					r.Siz = 4
 					ab.PutInt32(0)
 					break
 				}

 				// Assumes q is in this function.
 				// TODO: Check in input, preserve in brchain.

 				// Fill in backward jump now.
 				q = p.To.Target()

 				if q == nil {
 					ctxt.Diag("jmp/branch/loop without target")
 					ctxt.DiagFlush()
 					log.Fatalf("bad code")
 				}

 				if p.Back&branchBackwards != 0 {
 					v = q.Pc - (p.Pc + 2)
 					if v >= -128 && p.As != AXBEGIN {
 						if p.As == AJCXZL {
 							ab.Put1(0x67)
 						}
 						ab.Put2(byte(op), byte(v))
 					} else if yt.zcase == Zloop {
 						ctxt.Diag("loop too far: %v", p)
 					} else {
 						v -= 5 - 2
 						if p.As == AXBEGIN {
 							v--
 						}
 						if yt.zcase == Zbr {
 							ab.Put1(0x0f)
 							v--
 						}

 						ab.Put1(o.op[z+1])
 						ab.PutInt32(int32(v))
 					}

 					break
 				}

 				// Annotate target; will fill in later.
 				p.Forwd = q.Rel

 				q.Rel = p
 				if p.Back&branchShort != 0 && p.As != AXBEGIN {
 					if p.As == AJCXZL {
 						ab.Put1(0x67)
 					}
 					ab.Put2(byte(op), 0)
 				} else if yt.zcase == Zloop {
 					ctxt.Diag("loop too far: %v", p)
 				} else {
 					if yt.zcase == Zbr {
 						ab.Put1(0x0f)
 					}
 					ab.Put1(o.op[z+1])
 					ab.PutInt32(0)
 				}

 			case Zbyte:
 				v = vaddr(ctxt, p, &p.From, &rel)
 				if rel.Siz != 0 {
 					rel.Siz = uint8(op)
 					r = obj.Addrel(cursym)
 					*r = rel
 					r.Off = int32(p.Pc + int64(ab.Len()))
 				}

 				ab.Put1(byte(v))
 				if op > 1 {
 					ab.Put1(byte(v >> 8))
 					if op > 2 {
 						ab.PutInt16(int16(v >> 16))
 						if op > 4 {
 							ab.PutInt32(int32(v >> 32))
 						}
 					}
 				}
 			}

 			return
 		}
 	}
 	f3t = Ynone * Ymax
 	if p.GetFrom3() != nil {
 		f3t = oclass(ctxt, p, p.GetFrom3()) * Ymax
 	}
 	for mo := ymovtab; mo[0].as != 0; mo = mo[1:] {
 		var pp obj.Prog
 		var t []byte
 		if p.As == mo[0].as {
 			if ycover[ft+int(mo[0].ft)] != 0 && ycover[f3t+int(mo[0].f3t)] != 0 && ycover[tt+int(mo[0].tt)] != 0 {
 				t = mo[0].op[:]
 				switch mo[0].code {
 				default:
 					ctxt.Diag("asmins: unknown mov %d %v", mo[0].code, p)

 				case movLit:
 					for z = 0; t[z] != 0; z++ {
 						ab.Put1(t[z])
 					}

 				case movRegMem:
 					ab.Put1(t[0])
 					ab.asmando(ctxt, cursym, p, &p.To, int(t[1]))

 				case movMemReg:
 					ab.Put1(t[0])
 					ab.asmando(ctxt, cursym, p, &p.From, int(t[1]))

 				case movRegMem2op: // r,m - 2op
 					ab.Put2(t[0], t[1])
 					ab.asmando(ctxt, cursym, p, &p.To, int(t[2]))
 					ab.rexflag |= regrex[p.From.Reg] & (Rxr | 0x40)

 				case movMemReg2op:
 					ab.Put2(t[0], t[1])
 					ab.asmando(ctxt, cursym, p, &p.From, int(t[2]))
 					ab.rexflag |= regrex[p.To.Reg] & (Rxr | 0x40)

 				case movFullPtr:
 					if t[0] != 0 {
 						ab.Put1(t[0])
 					}
 					switch p.To.Index {
 					default:
 						goto bad

 					case REG_DS:
 						ab.Put1(0xc5)

 					case REG_SS:
 						ab.Put2(0x0f, 0xb2)

 					case REG_ES:
 						ab.Put1(0xc4)

 					case REG_FS:
 						ab.Put2(0x0f, 0xb4)

 					case REG_GS:
 						ab.Put2(0x0f, 0xb5)
 					}

 					ab.asmand(ctxt, cursym, p, &p.From, &p.To)

 				case movDoubleShift:
 					if t[0] == Pw {
 						if ctxt.Arch.Family != sys.AMD64 {
 							ctxt.Diag("asmins: illegal 64: %v", p)
 						}
 						ab.rexflag |= Pw
 						t = t[1:]
 					} else if t[0] == Pe {
 						ab.Put1(Pe)
 						t = t[1:]
 					}

 					switch p.From.Type {
 					default:
 						goto bad

 					case obj.TYPE_CONST:
 						ab.Put2(0x0f, t[0])
 						ab.asmandsz(ctxt, cursym, p, &p.To, reg[p.GetFrom3().Reg], regrex[p.GetFrom3().Reg], 0)
 						ab.Put1(byte(p.From.Offset))

 					case obj.TYPE_REG:
 						switch p.From.Reg {
 						default:
 							goto bad

 						case REG_CL, REG_CX:
 							ab.Put2(0x0f, t[1])
 							ab.asmandsz(ctxt, cursym, p, &p.To, reg[p.GetFrom3().Reg], regrex[p.GetFrom3().Reg], 0)
 						}
 					}

 				// NOTE: The systems listed here are the ones that use the "TLS initial exec" model,
 				// where you load the TLS base register into a register and then index off that
 				// register to access the actual TLS variables. Systems that allow direct TLS access
 				// are handled in prefixof above and should not be listed here.
 				case movTLSReg:
 					if ctxt.Arch.Family == sys.AMD64 && p.As != AMOVQ || ctxt.Arch.Family == sys.I386 && p.As != AMOVL {
 						ctxt.Diag("invalid load of TLS: %v", p)
 					}

 					if ctxt.Arch.Family == sys.I386 {
 						// NOTE: The systems listed here are the ones that use the "TLS initial exec" model,
 						// where you load the TLS base register into a register and then index off that
 						// register to access the actual TLS variables. Systems that allow direct TLS access
 						// are handled in prefixof above and should not be listed here.
 						switch ctxt.Headtype {
 						default:
 							log.Fatalf("unknown TLS base location for %v", ctxt.Headtype)

 						case objabi.Hlinux, objabi.Hfreebsd:
 							if ctxt.Flag_shared {
 								// Note that this is not generating the same insns as the other cases.
 								//     MOV TLS, dst
 								// becomes
 								//     call __x86.get_pc_thunk.dst
 								//     movl (gotpc + g@gotntpoff)(dst), dst
 								// which is encoded as
 								//     call __x86.get_pc_thunk.dst
 								//     movq 0(dst), dst
 								// and R_CALL & R_TLS_IE relocs. This all assumes the only tls variable we access
 								// is g, which we can't check here, but will when we assemble the second
 								// instruction.
 								dst := p.To.Reg
 								ab.Put1(0xe8)
 								r = obj.Addrel(cursym)
 								r.Off = int32(p.Pc + int64(ab.Len()))
 								r.Type = objabi.R_CALL
 								r.Siz = 4
 								r.Sym = ctxt.Lookup("__x86.get_pc_thunk." + strings.ToLower(rconv(int(dst))))
 								ab.PutInt32(0)

 								ab.Put2(0x8B, byte(2<<6|reg[dst]|(reg[dst]<<3)))
 								r = obj.Addrel(cursym)
 								r.Off = int32(p.Pc + int64(ab.Len()))
 								r.Type = objabi.R_TLS_IE
 								r.Siz = 4
 								r.Add = 2
 								ab.PutInt32(0)
 							} else {
 								// ELF TLS base is 0(GS).
 								pp.From = p.From

 								pp.From.Type = obj.TYPE_MEM
 								pp.From.Reg = REG_GS
 								pp.From.Offset = 0
 								pp.From.Index = REG_NONE
 								pp.From.Scale = 0
 								ab.Put2(0x65, // GS
 									0x8B)
 								ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
 							}
 						case objabi.Hplan9:
 							pp.From = obj.Addr{}
 							pp.From.Type = obj.TYPE_MEM
 							pp.From.Name = obj.NAME_EXTERN
 							pp.From.Sym = plan9privates
 							pp.From.Offset = 0
 							pp.From.Index = REG_NONE
 							ab.Put1(0x8B)
 							ab.asmand(ctxt, cursym, p, &pp.From, &p.To)

 						case objabi.Hwindows:
 							// Windows TLS base is always 0x14(FS).
 							pp.From = p.From

 							pp.From.Type = obj.TYPE_MEM
 							pp.From.Reg = REG_FS
 							pp.From.Offset = 0x14
 							pp.From.Index = REG_NONE
 							pp.From.Scale = 0
 							ab.Put2(0x64, // FS
 								0x8B)
 							ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
 						}
 						break
 					}

 					switch ctxt.Headtype {
 					default:
 						log.Fatalf("unknown TLS base location for %v", ctxt.Headtype)

 					case objabi.Hlinux, objabi.Hfreebsd:
 						if !ctxt.Flag_shared {
 							log.Fatalf("unknown TLS base location for linux/freebsd without -shared")
 						}
 						// Note that this is not generating the same insn as the other cases.
 						//     MOV TLS, R_to
 						// becomes
 						//     movq g@gottpoff(%rip), R_to
 						// which is encoded as
 						//     movq 0(%rip), R_to
 						// and a R_TLS_IE reloc. This all assumes the only tls variable we access
 						// is g, which we can't check here, but will when we assemble the second
 						// instruction.
 						ab.rexflag = Pw | (regrex[p.To.Reg] & Rxr)

 						ab.Put2(0x8B, byte(0x05|(reg[p.To.Reg]<<3)))
 						r = obj.Addrel(cursym)
 						r.Off = int32(p.Pc + int64(ab.Len()))
 						r.Type = objabi.R_TLS_IE
 						r.Siz = 4
 						r.Add = -4
 						ab.PutInt32(0)

 					case objabi.Hplan9:
 						pp.From = obj.Addr{}
 						pp.From.Type = obj.TYPE_MEM
 						pp.From.Name = obj.NAME_EXTERN
 						pp.From.Sym = plan9privates
 						pp.From.Offset = 0
 						pp.From.Index = REG_NONE
 						ab.rexflag |= Pw
 						ab.Put1(0x8B)
 						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)

 					case objabi.Hsolaris: // TODO(rsc): Delete Hsolaris from list. Should not use this code. See progedit in obj6.c.
 						// TLS base is 0(FS).
 						pp.From = p.From

 						pp.From.Type = obj.TYPE_MEM
 						pp.From.Name = obj.NAME_NONE
 						pp.From.Reg = REG_NONE
 						pp.From.Offset = 0
 						pp.From.Index = REG_NONE
 						pp.From.Scale = 0
 						ab.rexflag |= Pw
 						ab.Put2(0x64, // FS
 							0x8B)
 						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)

 					case objabi.Hwindows:
 						// Windows TLS base is always 0x28(GS).
 						pp.From = p.From

 						pp.From.Type = obj.TYPE_MEM
 						pp.From.Name = obj.NAME_NONE
 						pp.From.Reg = REG_GS
 						pp.From.Offset = 0x28
 						pp.From.Index = REG_NONE
 						pp.From.Scale = 0
 						ab.rexflag |= Pw
 						ab.Put2(0x65, // GS
 							0x8B)
 						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
 					}
 				}
 				return
 			}
 		}
 	}
 	goto bad

 bad:
 	if ctxt.Arch.Family != sys.AMD64 {
 		// here, the assembly has failed.
 		// if it's a byte instruction that has
 		// unaddressable registers, try to
 		// exchange registers and reissue the
 		// instruction with the operands renamed.
 		pp := *p

 		unbytereg(&pp.From, &pp.Ft)
 		unbytereg(&pp.To, &pp.Tt)

 		z := int(p.From.Reg)
 		if p.From.Type == obj.TYPE_REG && z >= REG_BP && z <= REG_DI {
 			// TODO(rsc): Use this code for x86-64 too. It has bug fixes not present in the amd64 code base.
 			// For now, different to keep bit-for-bit compatibility.
 			if ctxt.Arch.Family == sys.I386 {
 				breg := byteswapreg(ctxt, &p.To)
 				if breg != REG_AX {
 					ab.Put1(0x87) // xchg lhs,bx
 					ab.asmando(ctxt, cursym, p, &p.From, reg[breg])
 					subreg(&pp, z, breg)
 					ab.doasm(ctxt, cursym, &pp)
 					ab.Put1(0x87) // xchg lhs,bx
 					ab.asmando(ctxt, cursym, p, &p.From, reg[breg])
 				} else {
 					ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
 					subreg(&pp, z, REG_AX)
 					ab.doasm(ctxt, cursym, &pp)
 					ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
 				}
 				return
 			}

 			if isax(&p.To) || p.To.Type == obj.TYPE_NONE {
 				// We certainly don't want to exchange
 				// with AX if the op is MUL or DIV.
 				ab.Put1(0x87) // xchg lhs,bx
 				ab.asmando(ctxt, cursym, p, &p.From, reg[REG_BX])
 				subreg(&pp, z, REG_BX)
 				ab.doasm(ctxt, cursym, &pp)
 				ab.Put1(0x87) // xchg lhs,bx
 				ab.asmando(ctxt, cursym, p, &p.From, reg[REG_BX])
 			} else {
 				ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
 				subreg(&pp, z, REG_AX)
 				ab.doasm(ctxt, cursym, &pp)
 				ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
 			}
 			return
 		}

 		z = int(p.To.Reg)
 		if p.To.Type == obj.TYPE_REG && z >= REG_BP && z <= REG_DI {
 			// TODO(rsc): Use this code for x86-64 too. It has bug fixes not present in the amd64 code base.
 			// For now, different to keep bit-for-bit compatibility.
 			if ctxt.Arch.Family == sys.I386 {
 				breg := byteswapreg(ctxt, &p.From)
 				if breg != REG_AX {
 					ab.Put1(0x87) //xchg rhs,bx
 					ab.asmando(ctxt, cursym, p, &p.To, reg[breg])
 					subreg(&pp, z, breg)
 					ab.doasm(ctxt, cursym, &pp)
 					ab.Put1(0x87) // xchg rhs,bx
 					ab.asmando(ctxt, cursym, p, &p.To, reg[breg])
 				} else {
 					ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
 					subreg(&pp, z, REG_AX)
 					ab.doasm(ctxt, cursym, &pp)
 					ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
 				}
 				return
 			}

 			if isax(&p.From) {
 				ab.Put1(0x87) // xchg rhs,bx
 				ab.asmando(ctxt, cursym, p, &p.To, reg[REG_BX])
 				subreg(&pp, z, REG_BX)
 				ab.doasm(ctxt, cursym, &pp)
 				ab.Put1(0x87) // xchg rhs,bx
 				ab.asmando(ctxt, cursym, p, &p.To, reg[REG_BX])
 			} else {
 				ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
 				subreg(&pp, z, REG_AX)
 				ab.doasm(ctxt, cursym, &pp)
 				ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
 			}
 			return
 		}
 	}

 	ctxt.Diag("%s: invalid instruction: %v", cursym.Name, p)
 }

 // byteswapreg picks one of AX, BX, CX, DX (tried in that order) that the
 // operand a does not mention, for use as a byte-addressable stand-in.
 //
 // An operand of type TYPE_NONE rules out AX and DX, so BX is returned;
 // this accounts for MULB-like instructions that might use DX and AX.
 // If all four candidates are ruled out, a diagnostic is reported and the
 // process exits through log.Fatalf.
 func byteswapreg(ctxt *obj.Link, a *obj.Addr) int {
 	// Positions of the candidates in the tables below.
 	const (
 		ax = iota
 		bx
 		cx
 		dx
 	)
 	candidates := [...]int{REG_AX, REG_BX, REG_CX, REG_DX}

 	// blocked[i] is set once candidates[i] is known to be unusable.
 	var blocked [4]bool

 	isMem := a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR

 	if a.Type == obj.TYPE_NONE {
 		blocked[ax], blocked[dx] = true, true
 	}

 	// Base register: looked at for register operands and for memory/address
 	// operands that carry no symbol name.
 	if a.Type == obj.TYPE_REG || (isMem && a.Name == obj.NAME_NONE) {
 		switch a.Reg {
 		case REG_NONE:
 			blocked[ax], blocked[dx] = true, true
 		case REG_AX, REG_AL, REG_AH:
 			blocked[ax] = true
 		case REG_BX, REG_BL, REG_BH:
 			blocked[bx] = true
 		case REG_CX, REG_CL, REG_CH:
 			blocked[cx] = true
 		case REG_DX, REG_DL, REG_DH:
 			blocked[dx] = true
 		}
 	}

 	// Index register of a memory/address operand.
 	if isMem {
 		switch a.Index {
 		case REG_AX:
 			blocked[ax] = true
 		case REG_BX:
 			blocked[bx] = true
 		case REG_CX:
 			blocked[cx] = true
 		case REG_DX:
 			blocked[dx] = true
 		}
 	}

 	// Hand back the first candidate that is still free.
 	for i, taken := range blocked {
 		if !taken {
 			return candidates[i]
 		}
 	}

 	ctxt.Diag("impossible byte register")
 	ctxt.DiagFlush()
 	log.Fatalf("bad code")
 	return 0
 }

 // isbadbyte reports whether a is a register operand whose register lies
 // in the range REG_BP..REG_DI or in the range REG_BPB..REG_DIB.
 // NOTE(review): presumably these are the registers that cannot be used
 // directly as byte operands on 386 — confirm against the callers.
 func isbadbyte(a *obj.Addr) bool {
 	if a.Type != obj.TYPE_REG {
 		return false
 	}
 	reg := a.Reg
 	if REG_BP <= reg && reg <= REG_DI {
 		return true
 	}
 	return REG_BPB <= reg && reg <= REG_DIB
 }

 // asmins encodes the single instruction p into ab, starting from an
 // empty buffer, inserts a REX prefix when the encoding asked for one, and
 // then adjusts the relocations on cursym that belong to this instruction
 // (those whose offset is not below p.Pc).
 func (ab *AsmBuf) asmins(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
 	ab.Reset()
 	ab.rexflag, ab.vexflag, ab.evexflag = 0, false, false

 	start := ab.Len()
 	ab.doasm(ctxt, cursym, p)

 	// A REX byte is wanted only when doasm requested one and the
 	// instruction was not emitted with a VEX or EVEX encoding.
 	needRex := ab.rexflag != 0 && !ab.vexflag && !ab.evexflag
 	if needRex {
 		if ctxt.Arch.Family != sys.AMD64 {
 			ctxt.Diag("asmins: illegal in mode %d: %v (%d %d)", ctxt.Arch.RegSize*8, p, p.Ft, p.Tt)
 		}

 		// The REX prefix has to come right before the first opcode byte:
 		// after any 66/67/f2/f3/26/2e/3e (and 64/65) prefix bytes, but
 		// ahead of the 0f opcode escape, otherwise it might be ignored.
 		// Beware that the handbook often misleadingly lists 66/f2/f3 as
 		// part of the `opcode'.
 		pos := start
 	skipPrefixes:
 		for end := ab.Len(); pos < end; pos++ {
 			switch ab.At(pos) {
 			case 0xf2, 0xf3, 0x64, 0x65, 0x66, 0x67, 0x2e, 0x3e, 0x26:
 				// Prefix byte; the REX byte goes somewhere after it.
 			default:
 				break skipPrefixes
 			}
 		}
 		ab.Insert(pos, byte(0x40|ab.rexflag))
 	}

 	// Walk the relocations from the newest backwards and stop at the
 	// first one that lies before this instruction.
 	size := int64(ab.Len())
 	for i := len(cursym.R) - 1; i >= 0; i-- {
 		rel := &cursym.R[i]
 		if int64(rel.Off) < p.Pc {
 			break
 		}
 		if needRex {
 			// The inserted REX byte moved the relocated field by one.
 			rel.Off++
 		}

 		// Offset just past the relocated field.
 		relEnd := int64(rel.Off) + int64(rel.Siz)

 		switch rel.Type {
 		case objabi.R_PCREL:
 			if ctxt.Arch.Family == sys.AMD64 || p.As == obj.AJMP || p.As == obj.ACALL {
 				// PC-relative addressing is measured from the end of the
 				// instruction, whereas the linker relocates relative to
 				// the end of the relocation. Immediate arguments may
 				// follow the PC-relative memory reference in the
 				// encoding, so the two ends need not coincide. Fold the
 				// difference into the addend so the linker can keep
 				// working relative to the end of the relocation.
 				rel.Add -= p.Pc + size - relEnd
 			} else if ctxt.Arch.Family == sys.I386 {
 				// On 386, PC-relative addressing (for non-call/jmp
 				// instructions) assumes the previous instruction loaded
 				// the PC of its own end into CX, so the adjustment is
 				// made relative to that.
 				rel.Add += relEnd - p.Pc
 			}

 		case objabi.R_GOTPCREL:
 			if ctxt.Arch.Family == sys.I386 {
 				// On 386, R_GOTPCREL makes the same assumptions as R_PCREL.
 				rel.Add += relEnd - p.Pc
 			}
 		}
 	}
 }

 // unpackOps4 returns pointers to the four operands of p in order:
 // p.From, the first two entries of p.RestArgs, and p.To.
 // It indexes p.RestArgs directly, so p must carry at least two rest args.
 func unpackOps4(p *obj.Prog) (arg0, arg1, arg2, dst *obj.Addr) {
 	rest := p.RestArgs
 	arg0 = &p.From
 	arg1, arg2 = &rest[0].Addr, &rest[1].Addr
 	dst = &p.To
 	return arg0, arg1, arg2, dst
 }

 // unpackOps5 returns pointers to the five operands of p in order:
 // p.From, the first three entries of p.RestArgs, and p.To.
 // It indexes p.RestArgs directly, so p must carry at least three rest args.
 func unpackOps5(p *obj.Prog) (arg0, arg1, arg2, arg3, dst *obj.Addr) {
 	rest := p.RestArgs
 	arg0 = &p.From
 	arg1, arg2, arg3 = &rest[0].Addr, &rest[1].Addr, &rest[2].Addr
 	dst = &p.To
 	return arg0, arg1, arg2, arg3, dst
 }