crypto/internal/poly1305: implement function update in assembly on loong64

The performance improvements on Loongson-3A5000 and Loongson-3A6000 are as follows:

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/internal/poly1305
cpu: Loongson-3A5000 @ 2500.00MHz
                 |  bench.old   |              bench.new              |
                 |    sec/op    |   sec/op     vs base                |
64                  122.8n ± 0%   100.0n ± 0%  -18.57% (p=0.000 n=10)
1K                 1152.0n ± 0%   732.2n ± 0%  -36.44% (p=0.000 n=10)
2M                  2.356m ± 0%   1.443m ± 0%  -38.74% (p=0.000 n=10)
64Unaligned         122.7n ± 0%   101.5n ± 0%  -17.24% (p=0.000 n=10)
1KUnaligned        1152.0n ± 0%   745.4n ± 0%  -35.30% (p=0.000 n=10)
2MUnaligned         2.336m ± 0%   1.473m ± 0%  -36.94% (p=0.000 n=10)
Write64             77.92n ± 0%   54.88n ± 0%  -29.57% (p=0.000 n=10)
Write1K            1106.0n ± 0%   683.3n ± 0%  -38.22% (p=0.000 n=10)
Write2M             2.356m ± 0%   1.444m ± 0%  -38.72% (p=0.000 n=10)
Write64Unaligned    77.87n ± 0%   55.69n ± 0%  -28.49% (p=0.000 n=10)
Write1KUnaligned   1106.0n ± 0%   708.1n ± 0%  -35.97% (p=0.000 n=10)
Write2MUnaligned    2.335m ± 0%   1.471m ± 0%  -37.01% (p=0.000 n=10)
geomean             6.373µ        4.272µ       -32.96%

                 |  bench.old   |               bench.new               |
                 |     B/s      |      B/s       vs base                |
64                 497.1Mi ± 0%    610.3Mi ± 0%  +22.78% (p=0.000 n=10)
1K                 847.6Mi ± 0%   1333.7Mi ± 0%  +57.35% (p=0.000 n=10)
2M                 849.0Mi ± 0%   1385.9Mi ± 0%  +63.24% (p=0.000 n=10)
64Unaligned        497.4Mi ± 0%    600.9Mi ± 0%  +20.81% (p=0.000 n=10)
1KUnaligned        847.6Mi ± 0%   1310.1Mi ± 0%  +54.57% (p=0.000 n=10)
2MUnaligned        856.3Mi ± 0%   1357.9Mi ± 0%  +58.58% (p=0.000 n=10)
Write64            783.3Mi ± 0%   1112.2Mi ± 0%  +41.99% (p=0.000 n=10)
Write1K            882.8Mi ± 0%   1429.1Mi ± 0%  +61.88% (p=0.000 n=10)
Write2M            849.0Mi ± 0%   1385.4Mi ± 0%  +63.18% (p=0.000 n=10)
Write64Unaligned   783.8Mi ± 0%   1096.1Mi ± 0%  +39.85% (p=0.000 n=10)
Write1KUnaligned   882.8Mi ± 0%   1379.0Mi ± 0%  +56.20% (p=0.000 n=10)
Write2MUnaligned   856.5Mi ± 0%   1359.9Mi ± 0%  +58.76% (p=0.000 n=10)
geomean            772.2Mi         1.125Gi       +49.18%

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/internal/poly1305
cpu: Loongson-3A6000-HV @ 2500.00MHz
                 |  bench.old  |              bench.new              |
                 |   sec/op    |   sec/op     vs base                |
64                 92.06n ± 0%   71.55n ± 0%  -22.28% (p=0.000 n=10)
1K                 998.4n ± 0%   607.7n ± 0%  -39.13% (p=0.000 n=10)
2M                 1.976m ± 0%   1.165m ± 0%  -41.07% (p=0.000 n=10)
64Unaligned        92.05n ± 0%   71.55n ± 0%  -22.27% (p=0.000 n=10)
1KUnaligned        998.3n ± 0%   607.6n ± 0%  -39.13% (p=0.000 n=10)
2MUnaligned        1.975m ± 0%   1.222m ± 0%  -38.11% (p=0.000 n=10)
Write64            65.24n ± 0%   45.23n ± 0%  -30.67% (p=0.000 n=10)
Write1K            970.8n ± 0%   577.6n ± 0%  -40.51% (p=0.000 n=10)
Write2M            1.965m ± 0%   1.163m ± 0%  -40.81% (p=0.000 n=10)
Write64Unaligned   65.24n ± 0%   45.24n ± 0%  -30.66% (p=0.000 n=10)
Write1KUnaligned   970.8n ± 0%   577.6n ± 0%  -40.50% (p=0.000 n=10)
Write2MUnaligned   1.965m ± 0%   1.222m ± 0%  -37.81% (p=0.000 n=10)
geomean            5.317µ        3.426µ       -35.58%

                 |   bench.old   |               bench.new               |
                 |      B/s      |      B/s       vs base                |
64                  663.0Mi ± 0%    853.1Mi ± 0%  +28.67% (p=0.000 n=10)
1K                  978.1Mi ± 0%   1606.9Mi ± 0%  +64.28% (p=0.000 n=10)
2M                 1012.0Mi ± 0%   1717.4Mi ± 0%  +69.70% (p=0.000 n=10)
64Unaligned         663.1Mi ± 0%    853.1Mi ± 0%  +28.65% (p=0.000 n=10)
1KUnaligned         978.2Mi ± 0%   1607.1Mi ± 0%  +64.29% (p=0.000 n=10)
2MUnaligned        1012.6Mi ± 0%   1636.2Mi ± 0%  +61.58% (p=0.000 n=10)
Write64             935.5Mi ± 0%   1349.3Mi ± 0%  +44.23% (p=0.000 n=10)
Write1K            1005.9Mi ± 0%   1690.9Mi ± 0%  +68.09% (p=0.000 n=10)
Write2M            1017.7Mi ± 0%   1719.5Mi ± 0%  +68.95% (p=0.000 n=10)
Write64Unaligned    935.6Mi ± 0%   1349.3Mi ± 0%  +44.22% (p=0.000 n=10)
Write1KUnaligned   1006.0Mi ± 0%   1690.9Mi ± 0%  +68.08% (p=0.000 n=10)
Write2MUnaligned   1017.7Mi ± 0%   1636.4Mi ± 0%  +60.80% (p=0.000 n=10)
geomean             925.6Mi         1.403Gi       +55.22%

Change-Id: If05a8bfc868b3e6f903ff169eed7a894af741f9b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/638455
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
diff --git a/internal/poly1305/mac_noasm.go b/internal/poly1305/mac_noasm.go
index bd896bd..8d99551 100644
--- a/internal/poly1305/mac_noasm.go
+++ b/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
+//go:build (!amd64 && !loong64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
 
 package poly1305
 
diff --git a/internal/poly1305/sum_amd64.go b/internal/poly1305/sum_asm.go
similarity index 93%
rename from internal/poly1305/sum_amd64.go
rename to internal/poly1305/sum_asm.go
index 164cd47..315b84a 100644
--- a/internal/poly1305/sum_amd64.go
+++ b/internal/poly1305/sum_asm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (amd64 || loong64 || ppc64 || ppc64le)
 
 package poly1305
 
diff --git a/internal/poly1305/sum_loong64.s b/internal/poly1305/sum_loong64.s
new file mode 100644
index 0000000..bc8361d
--- /dev/null
+++ b/internal/poly1305/sum_loong64.s
@@ -0,0 +1,123 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+
+// func update(state *macState, msg []byte)
+TEXT ·update(SB), $0-32	// no local frame, 32 bytes of arguments (state pointer + slice header)
+	MOVV	state+0(FP), R4	// R4 = state pointer
+	MOVV	msg_base+8(FP), R5	// R5 = pointer to the next unread message byte
+	MOVV	msg_len+16(FP), R6	// R6 = number of message bytes still to process
+
+	MOVV	$0x10, R7	// R7 = 16, the full-block size used by the length compares
+
+	MOVV	(R4), R8	// R8 = h0
+	MOVV	8(R4), R9	// R9 = h1
+	MOVV	16(R4), R10	// R10 = h2
+	MOVV	24(R4), R11	// R11 = r0
+	MOVV	32(R4), R12	// R12 = r1
+
+	BLT	R6, R7, bytes_between_0_and_15	// fewer than 16 bytes: skip the full-block loop
+
+loop:	// absorb one full 16-byte block: h += msg[0:16], plus 1 in h2
+	MOVV	(R5), R14	// R14 = msg[0:8]
+	MOVV	8(R5), R16	// R16 = msg[8:16]
+	ADDV	R14, R8, R8	// h0 += msg[0:8] (unsigned overflow iff the sum is below an addend)
+	ADDV	R16, R9, R27	// R27 = h1 + msg[8:16]
+	SGTU	R14, R8, R24	// R24 = h0.carry (addend > sum)
+	SGTU	R9, R27, R28	// R28 = carry out of h1 + msg[8:16]
+	ADDV	R27, R24, R9	// h1 = R27 + h0.carry
+	SGTU	R27, R9, R24	// R24 = carry out of adding h0.carry
+	OR	R24, R28, R24	// R24 = h1.carry (either addition overflowed)
+	ADDV	$0x01, R24, R24	// plus 1: a full block also contributes a 1 at bit 128, i.e. +1 in h2
+	ADDV	R10, R24, R10	// h2 += h1.carry + 1
+
+	ADDV	$16, R5, R5	// msg = msg[16:]
+
+multiply:	// h = h * r with the bits above 2^130 folded back in; product limbs t0..t3 end up in R14, R15, R25, R26
+	MULV	R8, R11, R14	// R14 = h0r0.lo (t0)
+	MULHVU	R8, R11, R15	// R15 = h0r0.hi
+	MULV	R9, R11, R13	// R13 = h1r0.lo
+	MULHVU	R9, R11, R16	// R16 = h1r0.hi
+	ADDV	R13, R15, R15	// R15 = h0r0.hi + h1r0.lo
+	SGTU	R13, R15, R24	// R24 = carry out of that add
+	ADDV	R24, R16, R16	// R16 = h1r0.hi + carry
+	MULV	R10, R11, R25	// R25 = h2r0 (low 64 bits only)
+	ADDV	R16, R25, R25	// R25 = h2r0 + h1r0.hi + carry
+	MULV	R8, R12, R13	// R13 = h0r1.lo
+	MULHVU	R8, R12, R16	// R16 = h0r1.hi
+	ADDV	R13, R15, R15	// R15 += h0r1.lo (t1 complete)
+	SGTU	R13, R15, R24	// R24 = carry out of that add
+	ADDV	R24, R16, R16	// R16 = h0r1.hi + carry
+	MOVV	R16, R8	// park it in R8: h0 is not read again and R16 is reused below
+	MULV	R10, R12, R26	// R26 = h2r1 (low 64 bits only)
+	MULV	R9, R12, R13	// R13 = h1r1.lo
+	MULHVU	R9, R12, R16	// R16 = h1r1.hi
+	ADDV	R13, R25, R25	// R25 += h1r1.lo
+	ADDV	R16, R26, R27	// R27 = h1r1.hi + h2r1
+	SGTU	R13, R25, R24	// R24 = carry out of R25 += h1r1.lo
+	ADDV	R27, R24, R26	// R26 = h1r1.hi + h2r1 + carry
+	ADDV	R8, R25, R25	// R25 += parked h0r1.hi term (t2 complete)
+	SGTU	R8, R25, R24	// R24 = carry out of that add
+	ADDV	R24, R26, R26	// R26 += carry (t3 complete)
+	AND	$3, R25, R10	// h2 = low two bits of t2 (product bits 128 and 129)
+	AND	$-4, R25, R17	// R17 = t2 with its low two bits cleared; with t3 this is 4 * (t >> 130)
+	ADDV	R17, R14, R8	// h0 = t0 + R17
+	ADDV	R26, R15, R27	// R27 = t1 + t3
+	SGTU	R17, R8, R24	// R24 = h0.carry
+	SGTU	R26, R27, R28	// R28 = carry out of t1 + t3
+	ADDV	R27, R24, R9	// h1 = R27 + h0.carry
+	SGTU	R27, R9, R24	// R24 = carry out of adding h0.carry
+	OR	R24, R28, R24	// R24 = h1.carry
+	ADDV	R24, R10, R10	// h2 += h1.carry
+	SLLV	$62, R26, R27	// R27 = low two bits of t3 moved to bits 62-63
+	SRLV	$2, R25, R28	// R28 = t2 >> 2
+	SRLV	$2, R26, R26	// R26 = t3 >> 2
+	OR	R27, R28, R25	// R26:R25 = (t3:t2) >> 2, i.e. t >> 130
+	ADDV	R25, R8, R8	// h0 += low word of t >> 130 (the two folds together add 5 * (t >> 130))
+	ADDV	R26, R9, R27	// R27 = h1 + high word of t >> 130
+	SGTU	R25, R8, R24	// R24 = h0.carry
+	SGTU	R26, R27, R28	// R28 = carry out of h1 + high word
+	ADDV	R27, R24, R9	// h1 = R27 + h0.carry
+	SGTU	R27, R9, R24	// R24 = carry out of adding h0.carry
+	OR	R24, R28, R24	// R24 = h1.carry
+	ADDV	R24, R10, R10	// h2 += h1.carry
+
+	SUBV	$16, R6, R6	// one block (16 bytes) accounted for
+	BGE	R6, R7, loop	// at least one more full block remains
+
+bytes_between_0_and_15:	// here R6 is below 16: the count of trailing bytes
+	BEQ	R6, R0, done	// nothing left: store h and return
+	MOVV	$1, R14	// R15:R14 (hi:lo) starts as a single 1 bit, which ends up just above the last byte
+	XOR	R15, R15	// R15 = 0
+	ADDV	R6, R5, R5	// R5 = one past the last message byte (bytes are read back to front)
+
+flush_buffer:	// shift R15:R14 left by 8 bits and insert the next byte, last byte first
+	MOVBU	-1(R5), R25	// R25 = byte just below R5, zero-extended
+	SRLV	$56, R14, R24	// R24 = top byte of the low word
+	SLLV	$8, R15, R28	// R28 = high word << 8
+	SLLV	$8, R14, R14	// low word <<= 8
+	OR	R24, R28, R15	// high word takes in the byte shifted out of the low word
+	XOR	R25, R14, R14	// low byte of R14 is zero after the shift, so XOR inserts the new byte
+	SUBV	$1, R6, R6	// one fewer byte remaining
+	SUBV	$1, R5, R5	// step back to the previous byte
+	BNE	R6, R0, flush_buffer	// repeat until every trailing byte is consumed
+
+	ADDV	R14, R8, R8	// h0 += low word of the padded block
+	SGTU	R14, R8, R24	// R24 = h0.carry
+	ADDV	R15, R9, R27	// R27 = h1 + high word of the padded block
+	SGTU	R15, R27, R28	// R28 = carry out of that add
+	ADDV	R27, R24, R9	// h1 = R27 + h0.carry
+	SGTU	R27, R9, R24	// R24 = carry out of adding h0.carry
+	OR	R24, R28, R24	// R24 = h1.carry
+	ADDV	R10, R24, R10	// h2 += h1.carry (no extra +1: the 1 bit is already inside R15:R14)
+
+	MOVV	$16, R6	// so the SUBV $16 after multiply leaves R6 = 0 and control reaches done
+	JMP	multiply	// reuse the shared multiply-and-fold code for the final block
+
+done:	// write the updated accumulator back; r0 and r1 are not written
+	MOVV	R8, (R4)	// store h0
+	MOVV	R9, 8(R4)	// store h1
+	MOVV	R10, 16(R4)	// store h2
+	RET
diff --git a/internal/poly1305/sum_ppc64x.go b/internal/poly1305/sum_ppc64x.go
deleted file mode 100644
index 1a1679a..0000000
--- a/internal/poly1305/sum_ppc64x.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build gc && !purego && (ppc64 || ppc64le)
-
-package poly1305
-
-//go:noescape
-func update(state *macState, msg []byte)
-
-// mac is a wrapper for macGeneric that redirects calls that would have gone to
-// updateGeneric to update.
-//
-// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
-// using function pointers would carry a major performance cost.
-type mac struct{ macGeneric }
-
-func (h *mac) Write(p []byte) (int, error) {
-	nn := len(p)
-	if h.offset > 0 {
-		n := copy(h.buffer[h.offset:], p)
-		if h.offset+n < TagSize {
-			h.offset += n
-			return nn, nil
-		}
-		p = p[n:]
-		h.offset = 0
-		update(&h.macState, h.buffer[:])
-	}
-	if n := len(p) - (len(p) % TagSize); n > 0 {
-		update(&h.macState, p[:n])
-		p = p[n:]
-	}
-	if len(p) > 0 {
-		h.offset += copy(h.buffer[h.offset:], p)
-	}
-	return nn, nil
-}
-
-func (h *mac) Sum(out *[16]byte) {
-	state := h.macState
-	if h.offset > 0 {
-		update(&state, h.buffer[:h.offset])
-	}
-	finalize(out, &state.h, &state.s)
-}