crypto/internal/poly1305: implement function update in assembly on loong64
The performance improvements on Loongson-3A5000 and Loongson-3A6000 are as follows:
goos: linux
goarch: loong64
pkg: golang.org/x/crypto/internal/poly1305
cpu: Loongson-3A5000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
64 122.8n ± 0% 100.0n ± 0% -18.57% (p=0.000 n=10)
1K 1152.0n ± 0% 732.2n ± 0% -36.44% (p=0.000 n=10)
2M 2.356m ± 0% 1.443m ± 0% -38.74% (p=0.000 n=10)
64Unaligned 122.7n ± 0% 101.5n ± 0% -17.24% (p=0.000 n=10)
1KUnaligned 1152.0n ± 0% 745.4n ± 0% -35.30% (p=0.000 n=10)
2MUnaligned 2.336m ± 0% 1.473m ± 0% -36.94% (p=0.000 n=10)
Write64 77.92n ± 0% 54.88n ± 0% -29.57% (p=0.000 n=10)
Write1K 1106.0n ± 0% 683.3n ± 0% -38.22% (p=0.000 n=10)
Write2M 2.356m ± 0% 1.444m ± 0% -38.72% (p=0.000 n=10)
Write64Unaligned 77.87n ± 0% 55.69n ± 0% -28.49% (p=0.000 n=10)
Write1KUnaligned 1106.0n ± 0% 708.1n ± 0% -35.97% (p=0.000 n=10)
Write2MUnaligned 2.335m ± 0% 1.471m ± 0% -37.01% (p=0.000 n=10)
geomean 6.373µ 4.272µ -32.96%
| bench.old | bench.new |
| B/s | B/s vs base |
64 497.1Mi ± 0% 610.3Mi ± 0% +22.78% (p=0.000 n=10)
1K 847.6Mi ± 0% 1333.7Mi ± 0% +57.35% (p=0.000 n=10)
2M 849.0Mi ± 0% 1385.9Mi ± 0% +63.24% (p=0.000 n=10)
64Unaligned 497.4Mi ± 0% 600.9Mi ± 0% +20.81% (p=0.000 n=10)
1KUnaligned 847.6Mi ± 0% 1310.1Mi ± 0% +54.57% (p=0.000 n=10)
2MUnaligned 856.3Mi ± 0% 1357.9Mi ± 0% +58.58% (p=0.000 n=10)
Write64 783.3Mi ± 0% 1112.2Mi ± 0% +41.99% (p=0.000 n=10)
Write1K 882.8Mi ± 0% 1429.1Mi ± 0% +61.88% (p=0.000 n=10)
Write2M 849.0Mi ± 0% 1385.4Mi ± 0% +63.18% (p=0.000 n=10)
Write64Unaligned 783.8Mi ± 0% 1096.1Mi ± 0% +39.85% (p=0.000 n=10)
Write1KUnaligned 882.8Mi ± 0% 1379.0Mi ± 0% +56.20% (p=0.000 n=10)
Write2MUnaligned 856.5Mi ± 0% 1359.9Mi ± 0% +58.76% (p=0.000 n=10)
geomean 772.2Mi 1.125Gi +49.18%
goos: linux
goarch: loong64
pkg: golang.org/x/crypto/internal/poly1305
cpu: Loongson-3A6000-HV @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
64 92.06n ± 0% 71.55n ± 0% -22.28% (p=0.000 n=10)
1K 998.4n ± 0% 607.7n ± 0% -39.13% (p=0.000 n=10)
2M 1.976m ± 0% 1.165m ± 0% -41.07% (p=0.000 n=10)
64Unaligned 92.05n ± 0% 71.55n ± 0% -22.27% (p=0.000 n=10)
1KUnaligned 998.3n ± 0% 607.6n ± 0% -39.13% (p=0.000 n=10)
2MUnaligned 1.975m ± 0% 1.222m ± 0% -38.11% (p=0.000 n=10)
Write64 65.24n ± 0% 45.23n ± 0% -30.67% (p=0.000 n=10)
Write1K 970.8n ± 0% 577.6n ± 0% -40.51% (p=0.000 n=10)
Write2M 1.965m ± 0% 1.163m ± 0% -40.81% (p=0.000 n=10)
Write64Unaligned 65.24n ± 0% 45.24n ± 0% -30.66% (p=0.000 n=10)
Write1KUnaligned 970.8n ± 0% 577.6n ± 0% -40.50% (p=0.000 n=10)
Write2MUnaligned 1.965m ± 0% 1.222m ± 0% -37.81% (p=0.000 n=10)
geomean 5.317µ 3.426µ -35.58%
| bench.old | bench.new |
| B/s | B/s vs base |
64 663.0Mi ± 0% 853.1Mi ± 0% +28.67% (p=0.000 n=10)
1K 978.1Mi ± 0% 1606.9Mi ± 0% +64.28% (p=0.000 n=10)
2M 1012.0Mi ± 0% 1717.4Mi ± 0% +69.70% (p=0.000 n=10)
64Unaligned 663.1Mi ± 0% 853.1Mi ± 0% +28.65% (p=0.000 n=10)
1KUnaligned 978.2Mi ± 0% 1607.1Mi ± 0% +64.29% (p=0.000 n=10)
2MUnaligned 1012.6Mi ± 0% 1636.2Mi ± 0% +61.58% (p=0.000 n=10)
Write64 935.5Mi ± 0% 1349.3Mi ± 0% +44.23% (p=0.000 n=10)
Write1K 1005.9Mi ± 0% 1690.9Mi ± 0% +68.09% (p=0.000 n=10)
Write2M 1017.7Mi ± 0% 1719.5Mi ± 0% +68.95% (p=0.000 n=10)
Write64Unaligned 935.6Mi ± 0% 1349.3Mi ± 0% +44.22% (p=0.000 n=10)
Write1KUnaligned 1006.0Mi ± 0% 1690.9Mi ± 0% +68.08% (p=0.000 n=10)
Write2MUnaligned 1017.7Mi ± 0% 1636.4Mi ± 0% +60.80% (p=0.000 n=10)
geomean 925.6Mi 1.403Gi +55.22%
Change-Id: If05a8bfc868b3e6f903ff169eed7a894af741f9b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/638455
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
diff --git a/internal/poly1305/mac_noasm.go b/internal/poly1305/mac_noasm.go
index bd896bd..8d99551 100644
--- a/internal/poly1305/mac_noasm.go
+++ b/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
+//go:build (!amd64 && !loong64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
package poly1305
diff --git a/internal/poly1305/sum_amd64.go b/internal/poly1305/sum_asm.go
similarity index 93%
rename from internal/poly1305/sum_amd64.go
rename to internal/poly1305/sum_asm.go
index 164cd47..315b84a 100644
--- a/internal/poly1305/sum_amd64.go
+++ b/internal/poly1305/sum_asm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build gc && !purego
+//go:build gc && !purego && (amd64 || loong64 || ppc64 || ppc64le)
package poly1305
diff --git a/internal/poly1305/sum_loong64.s b/internal/poly1305/sum_loong64.s
new file mode 100644
index 0000000..bc8361d
--- /dev/null
+++ b/internal/poly1305/sum_loong64.s
@@ -0,0 +1,123 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+
+// func update(state *macState, msg []byte): folds msg into the accumulator (h0, h1, h2) read from state, 16 bytes per step, as h = (h + block) * r followed by a partial reduction, then stores h back.
+TEXT ·update(SB), $0-32 // no local frame; 32 bytes of arguments (state pointer + slice header)
+ MOVV state+0(FP), R4 // R4 = state
+ MOVV msg_base+8(FP), R5 // R5 = pointer to the next unread message byte
+ MOVV msg_len+16(FP), R6 // R6 = number of message bytes still to process
+
+ MOVV $0x10, R7 // R7 = 16, the block size used by both length comparisons below
+
+ MOVV (R4), R8 // h0
+ MOVV 8(R4), R9 // h1
+ MOVV 16(R4), R10 // h2
+ MOVV 24(R4), R11 // r0
+ MOVV 32(R4), R12 // r1
+
+ BLT R6, R7, bytes_between_0_and_15 // fewer than 16 bytes: skip the full-block loop
+
+loop:
+ MOVV (R5), R14 // msg[0:8]
+ MOVV 8(R5), R16 // msg[8:16]
+ ADDV R14, R8, R8 // h0 (x1 + y1 = z1', if z1' < x1 then z1' overflow)
+ ADDV R16, R9, R27 // R27 = h1 + msg[8:16], before the carry from h0 is added
+ SGTU R14, R8, R24 // h0.carry
+ SGTU R9, R27, R28 // carry out of h1 + msg[8:16]; R9 still holds the old h1 here
+ ADDV R27, R24, R9 // h1
+ SGTU R27, R9, R24 // carry out of adding h0.carry
+ OR R24, R28, R24 // h1.carry
+ ADDV $0x01, R24, R24 // carry + 1: the extra 1 lands in the h2 limb, i.e. adds 2^128 for a full block (presumably the Poly1305 pad bit)
+ ADDV R10, R24, R10 // h2
+
+ ADDV $16, R5, R5 // msg = msg[16:]
+
+multiply: // h *= r; the 4-limb product t0..t3 ends up in R14, R15, R25, R26 and is then folded back into h0..h2
+ MULV R8, R11, R14 // h0r0.lo
+ MULHVU R8, R11, R15 // h0r0.hi
+ MULV R9, R11, R13 // h1r0.lo
+ MULHVU R9, R11, R16 // h1r0.hi
+ ADDV R13, R15, R15 // t1 = h0r0.hi + h1r0.lo
+ SGTU R13, R15, R24 // carry out of that addition
+ ADDV R24, R16, R16 // h1r0.hi += carry
+ MULV R10, R11, R25 // h2r0 (low 64 bits only)
+ ADDV R16, R25, R25 // t2 = h1r0.hi + h2r0
+ MULV R8, R12, R13 // h0r1.lo
+ MULHVU R8, R12, R16 // h0r1.hi
+ ADDV R13, R15, R15 // t1 += h0r1.lo
+ SGTU R13, R15, R24 // carry out of that addition
+ ADDV R24, R16, R16 // h0r1.hi += carry
+ MOVV R16, R8 // park h0r1.hi in R8; h0 is not read again in this multiply, and R16 is reused below
+ MULV R10, R12, R26 // h2r1
+ MULV R9, R12, R13 // h1r1.lo
+ MULHVU R9, R12, R16 // h1r1.hi
+ ADDV R13, R25, R25 // t2 += h1r1.lo
+ ADDV R16, R26, R27 // R27 = h1r1.hi + h2r1
+ SGTU R13, R25, R24 // carry out of t2 += h1r1.lo
+ ADDV R27, R24, R26 // t3 = h1r1.hi + h2r1 + carry
+ ADDV R8, R25, R25 // t2 += h0r1.hi (parked in R8 above)
+ SGTU R8, R25, R24 // carry out of that addition
+ ADDV R24, R26, R26 // t3 += carry
+ AND $3, R25, R10 // new h2 = low 2 bits of t2 (bits 128..129 of the product)
+ AND $-4, R25, R17 // R17 = t2 with its low 2 bits cleared; with t3 this is 4*c, where c = product >> 130
+ ADDV R17, R14, R8 // h0 = t0 + low limb of 4*c
+ ADDV R26, R15, R27 // R27 = t1 + high limb of 4*c (t3)
+ SGTU R17, R8, R24 // carry out of the h0 addition
+ SGTU R26, R27, R28 // carry out of t1 + t3
+ ADDV R27, R24, R9 // h1 = t1 + t3 + carry from h0
+ SGTU R27, R9, R24 // carry out of adding the h0 carry
+ OR R24, R28, R24 // combined carry out of h1
+ ADDV R24, R10, R10 // h2 += carry
+ SLLV $62, R26, R27 // low 2 bits of t3, moved to the top of a word
+ SRLV $2, R25, R28 // t2 >> 2
+ SRLV $2, R26, R26 // high limb of c = t3 >> 2
+ OR R27, R28, R25 // low limb of c = (t3:t2) >> 2
+ ADDV R25, R8, R8 // h0 += low limb of c; with the 4*c added above this totals 5*c, folding bits >= 2^130 back in (2^130 = 5 mod 2^130 - 5)
+ ADDV R26, R9, R27 // R27 = h1 + high limb of c
+ SGTU R25, R8, R24 // carry out of the h0 addition
+ SGTU R26, R27, R28 // carry out of h1 + high limb of c
+ ADDV R27, R24, R9 // h1 += carry from h0
+ SGTU R27, R9, R24 // carry out of adding the h0 carry
+ OR R24, R28, R24 // combined carry out of h1
+ ADDV R24, R10, R10 // h2 += carry
+
+ SUBV $16, R6, R6 // 16 more bytes consumed
+ BGE R6, R7, loop // another full block remains
+
+bytes_between_0_and_15:
+ BEQ R6, R0, done // nothing left over: store h and return
+ MOVV $1, R14 // R15:R14 starts as 1; that bit is shifted up to sit just above the last message byte
+ XOR R15, R15 // R15 = 0
+ ADDV R6, R5, R5 // R5 = one past the last message byte; the tail is read back to front
+
+flush_buffer: // each pass shifts the 128-bit value R15:R14 left by 8 and inserts one byte at the bottom
+ MOVBU -1(R5), R25 // next byte, walking backwards
+ SRLV $56, R14, R24 // top byte of the low word, about to move into the high word
+ SLLV $8, R15, R28 // high word << 8
+ SLLV $8, R14, R14 // low word << 8
+ OR R24, R28, R15 // high word = (high << 8) | bits shifted out of the low word
+ XOR R25, R14, R14 // insert the byte; the low 8 bits are zero after the shift, so XOR acts as OR
+ SUBV $1, R6, R6 // one fewer byte remaining
+ SUBV $1, R5, R5 // step back one byte
+ BNE R6, R0, flush_buffer // repeat until every leftover byte is packed
+
+ ADDV R14, R8, R8 // h0 += low word of the packed tail
+ SGTU R14, R8, R24 // h0.carry
+ ADDV R15, R9, R27 // R27 = h1 + high word of the packed tail
+ SGTU R15, R27, R28 // carry out of that addition
+ ADDV R27, R24, R9 // h1 += h0.carry
+ SGTU R27, R9, R24 // carry out of adding h0.carry
+ OR R24, R28, R24 // h1.carry
+ ADDV R10, R24, R10 // h2 += h1.carry; no extra +1 here, the marker bit is already inside R15:R14
+
+ MOVV $16, R6 // make the SUBV after multiply leave R6 = 0, so control falls through to done
+ JMP multiply // reuse the shared multiply-and-reduce code for the final partial block
+
+done:
+ MOVV R8, (R4) // store h0
+ MOVV R9, 8(R4) // store h1
+ MOVV R10, 16(R4) // store h2
+ RET
diff --git a/internal/poly1305/sum_ppc64x.go b/internal/poly1305/sum_ppc64x.go
deleted file mode 100644
index 1a1679a..0000000
--- a/internal/poly1305/sum_ppc64x.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build gc && !purego && (ppc64 || ppc64le)
-
-package poly1305
-
-//go:noescape
-func update(state *macState, msg []byte)
-
-// mac is a wrapper for macGeneric that redirects calls that would have gone to
-// updateGeneric to update.
-//
-// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
-// using function pointers would carry a major performance cost.
-type mac struct{ macGeneric }
-
-func (h *mac) Write(p []byte) (int, error) {
- nn := len(p)
- if h.offset > 0 {
- n := copy(h.buffer[h.offset:], p)
- if h.offset+n < TagSize {
- h.offset += n
- return nn, nil
- }
- p = p[n:]
- h.offset = 0
- update(&h.macState, h.buffer[:])
- }
- if n := len(p) - (len(p) % TagSize); n > 0 {
- update(&h.macState, p[:n])
- p = p[n:]
- }
- if len(p) > 0 {
- h.offset += copy(h.buffer[h.offset:], p)
- }
- return nn, nil
-}
-
-func (h *mac) Sum(out *[16]byte) {
- state := h.macState
- if h.offset > 0 {
- update(&state, h.buffer[:h.offset])
- }
- finalize(out, &state.h, &state.s)
-}