vector: use asm opcode mnemonics
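There's no change in the binary output, just less mystery in the asm.
Note that the Go assembler spells two of these differently from the
Intel manuals: PSRLL is Intel's psrld, and PMULULQ is Intel's pmuludq.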
These mnemonics were introduced in Go 1.10:
https://golang.org/doc/go1.10#asm and https://golang.org/cl/75490
The current stable release (as of 2018-11-10) is Go 1.11, so under the
release policy at https://golang.org/doc/devel/release.html#policy,
Go 1.9 and below are no longer supported.
Change-Id: I1f9a63521bc8d5e8f8d395605f62bf7fb6a63bc5
Reviewed-on: https://go-review.googlesource.com/c/148997
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s
index 69e0fc2..fc6e7f8 100644
--- a/vector/acc_amd64.s
+++ b/vector/acc_amd64.s
@@ -139,17 +139,9 @@
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
- //
- // pabsd %xmm1,%xmm2
- // psrld $0x2,%xmm2
- // pminud %xmm5,%xmm2
- //
- // Hopefully we'll get these opcode mnemonics into the assembler for Go
- // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
- // it's similar.
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
- BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+ PABSD X1, X2
+ PSRLL $2, X2
+ PMINUD X5, X2
// z = convertToInt32(y)
// No-op.
@@ -182,13 +174,10 @@
PSRLQ $32, X11
// Multiply by magic, shift by magic.
- //
- // pmuludq %xmm10,%xmm0
- // pmuludq %xmm10,%xmm11
- BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
- BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
- PSRLQ $47, X0
- PSRLQ $47, X11
+ PMULULQ X10, X0
+ PMULULQ X10, X11
+ PSRLQ $47, X0
+ PSRLQ $47, X11
// Merge the two registers back to one, X11, and add maskA.
PSLLQ $32, X11
@@ -223,17 +212,9 @@
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
- //
- // pabsd %xmm1,%xmm2
- // psrld $0x2,%xmm2
- // pminud %xmm5,%xmm2
- //
- // Hopefully we'll get these opcode mnemonics into the assembler for Go
- // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
- // it's similar.
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
- BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+ PABSD X1, X2
+ PSRLL $2, X2
+ PMINUD X5, X2
// z = convertToInt32(y)
// No-op.
@@ -346,17 +327,9 @@
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
- //
- // pabsd %xmm1,%xmm2
- // psrld $0x2,%xmm2
- // pminud %xmm5,%xmm2
- //
- // Hopefully we'll get these opcode mnemonics into the assembler for Go
- // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
- // it's similar.
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
- BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+ PABSD X1, X2
+ PSRLL $2, X2
+ PMINUD X5, X2
// z = convertToInt32(y)
// No-op.
@@ -390,17 +363,9 @@
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
- //
- // pabsd %xmm1,%xmm2
- // psrld $0x2,%xmm2
- // pminud %xmm5,%xmm2
- //
- // Hopefully we'll get these opcode mnemonics into the assembler for Go
- // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
- // it's similar.
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
- BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+ PABSD X1, X2
+ PSRLL $2, X2
+ PMINUD X5, X2
// z = convertToInt32(y)
// No-op.
@@ -492,17 +457,9 @@
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
- //
- // pabsd %xmm1,%xmm2
- // psrld $0x2,%xmm2
- // pminud %xmm5,%xmm2
- //
- // Hopefully we'll get these opcode mnemonics into the assembler for Go
- // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
- // it's similar.
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
- BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+ PABSD X1, X2
+ PSRLL $2, X2
+ PMINUD X5, X2
// z = convertToInt32(y)
// No-op.
@@ -534,17 +491,9 @@
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
- //
- // pabsd %xmm1,%xmm2
- // psrld $0x2,%xmm2
- // pminud %xmm5,%xmm2
- //
- // Hopefully we'll get these opcode mnemonics into the assembler for Go
- // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
- // it's similar.
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
- BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+ PABSD X1, X2
+ PSRLL $2, X2
+ PMINUD X5, X2
// z = convertToInt32(y)
// No-op.
@@ -696,13 +645,10 @@
PSRLQ $32, X11
// Multiply by magic, shift by magic.
- //
- // pmuludq %xmm10,%xmm0
- // pmuludq %xmm10,%xmm11
- BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
- BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
- PSRLQ $47, X0
- PSRLQ $47, X11
+ PMULULQ X10, X0
+ PMULULQ X10, X11
+ PSRLQ $47, X0
+ PSRLQ $47, X11
// Merge the two registers back to one, X11, and add maskA.
PSLLQ $32, X11
diff --git a/vector/gen.go b/vector/gen.go
index 28b298b..2e71a51 100644
--- a/vector/gen.go
+++ b/vector/gen.go
@@ -296,17 +296,9 @@
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
- //
- // pabsd %xmm1,%xmm2
- // psrld $0x2,%xmm2
- // pminud %xmm5,%xmm2
- //
- // Hopefully we'll get these opcode mnemonics into the assembler for Go
- // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
- // it's similar.
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
- BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+ PABSD X1, X2
+ PSRLL $2, X2
+ PMINUD X5, X2
`
flClampAndScale = `
// y = x & flSignMask
@@ -356,13 +348,10 @@
MOVOU X0, X11
PSRLQ $32, X11
// Multiply by magic, shift by magic.
- //
- // pmuludq %xmm10,%xmm0
- // pmuludq %xmm10,%xmm11
- BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
- BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
- PSRLQ $47, X0
- PSRLQ $47, X11
+ PMULULQ X10, X0
+ PMULULQ X10, X11
+ PSRLQ $47, X0
+ PSRLQ $47, X11
// Merge the two registers back to one, X11, and add maskA.
PSLLQ $32, X11
XORPS X0, X11