[dev.boringcrypto] all: merge master into dev.boringcrypto

Change-Id: Ic5f71c04f08c03319c043f35be501875adb0a3b0
diff --git a/README.md b/README.md
index 829fe77..e40f3aa 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@
 To contribute, please read the contribution guidelines at https://go.dev/doc/contribute.
 
 Note that the Go project uses the issue tracker for bug reports and
-proposals only. See https://golang.org/wiki/Questions for a list of
+proposals only. See https://go.dev/wiki/Questions for a list of
 places to ask questions about the Go language.
 
 [rf]: https://reneefrench.blogspot.com/
diff --git a/SECURITY.md b/SECURITY.md
index 9e92e8b..ab608f3 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -2,12 +2,12 @@
 
 ## Supported Versions
 
-We support the past two Go releases (for example, Go 1.12.x and Go 1.13.x).
+We support the past two Go releases (for example, Go 1.17.x and Go 1.18.x when Go 1.18.x is the latest stable release).
 
-See https://golang.org/wiki/Go-Release-Cycle and in particular the
-[Release Maintenance](https://github.com/golang/go/wiki/Go-Release-Cycle#release-maintenance)
+See https://go.dev/wiki/Go-Release-Cycle and in particular the
+[Release Maintenance](https://go.dev/wiki/Go-Release-Cycle#release-maintenance)
 part of that page.
 
 ## Reporting a Vulnerability
 
-See https://golang.org/security for how to report a vulnerability.
+See https://go.dev/security for how to report a vulnerability.
diff --git a/api/next/30715.txt b/api/next/30715.txt
new file mode 100644
index 0000000..077a8d1
--- /dev/null
+++ b/api/next/30715.txt
@@ -0,0 +1,3 @@
+pkg net/http, type MaxBytesError struct #30715
+pkg net/http, type MaxBytesError struct, Limit int64 #30715
+pkg net/http, method (*MaxBytesError) Error() string #30715
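For illustration of the new API above, a minimal sketch of how a handler can distinguish an over-limit body; the route, the 1 MiB limit, and the handler name are assumptions, not part of this change:

	package main

	import (
		"errors"
		"io"
		"log"
		"net/http"
	)

	func handler(w http.ResponseWriter, r *http.Request) {
		// Cap the body at 1 MiB; reads past the limit fail with a
		// *http.MaxBytesError rather than a generic error.
		r.Body = http.MaxBytesReader(w, r.Body, 1<<20)
		if _, err := io.ReadAll(r.Body); err != nil {
			var mbe *http.MaxBytesError
			if errors.As(err, &mbe) {
				log.Printf("body exceeded limit of %d bytes", mbe.Limit)
				http.Error(w, "request too large", http.StatusRequestEntityTooLarge)
				return
			}
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
	}

	func main() {
		http.HandleFunc("/", handler)
		log.Fatal(http.ListenAndServe(":8080", nil))
	}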
diff --git a/api/next/50599.txt b/api/next/50599.txt
new file mode 100644
index 0000000..be271ea
--- /dev/null
+++ b/api/next/50599.txt
@@ -0,0 +1 @@
+pkg os/exec, method (*Cmd) Environ() []string #50599
diff --git a/api/next/51684.txt b/api/next/51684.txt
new file mode 100644
index 0000000..b8a0645
--- /dev/null
+++ b/api/next/51684.txt
@@ -0,0 +1,2 @@
+pkg regexp/syntax, const ErrNestingDepth = "expression nests too deeply" #51684
+pkg regexp/syntax, const ErrNestingDepth ErrorCode #51684
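A hedged sketch of how the renamed constant surfaces to callers; the nesting depth used here is an assumption chosen to exceed the parser's internal limit:

	package main

	import (
		"fmt"
		"regexp/syntax"
		"strings"
	)

	func main() {
		// A pathologically nested expression trips the depth check.
		pat := strings.Repeat("(", 5000) + "a" + strings.Repeat(")", 5000)
		_, err := syntax.Parse(pat, syntax.Perl)
		if perr, ok := err.(*syntax.Error); ok && perr.Code == syntax.ErrNestingDepth {
			fmt.Println("got ErrNestingDepth:", perr)
		}
	}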
diff --git a/api/next/regexpdepth.txt b/api/next/regexpdepth.txt
deleted file mode 100644
index 9810218..0000000
--- a/api/next/regexpdepth.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-pkg regexp/syntax, const ErrInvalidDepth = "invalid nesting depth" #0
-pkg regexp/syntax, const ErrInvalidDepth ErrorCode #0
-
diff --git a/doc/go1.19.html b/doc/go1.19.html
index a813d59..51b5a54 100644
--- a/doc/go1.19.html
+++ b/doc/go1.19.html
@@ -92,6 +92,16 @@
   TODO: complete this section
 </p>
 
+<dl id="crypto/tls"><dt><a href="/pkg/crypto/tls/">crypto/tls</a></dt>
+  <dd>
+    <p><!-- CL 400974 -->
+      The <code>tls10default</code> <code>GODEBUG</code> option has been
+      removed. It is still possible to enable TLS 1.0 client-side by setting
+      <code>Config.MinVersion</code>.
+    </p>
+  </dd>
+</dl><!-- crypto/tls -->
+
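A minimal client-side sketch of the workaround named in the note above; the dial target is an assumption for illustration:

	package main

	import "crypto/tls"

	func main() {
		// With the tls10default GODEBUG option removed, TLS 1.0 must be
		// enabled explicitly on the client.
		cfg := &tls.Config{MinVersion: tls.VersionTLS10}
		conn, err := tls.Dial("tcp", "legacy.example.com:443", cfg)
		if err == nil {
			conn.Close()
		}
	}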
 <dl id="image/draw"><dt><a href="/pkg/image/draw/">image/draw</a></dt>
   <dd>
     <p><!-- CL 396795 -->
@@ -132,6 +142,21 @@
   </dd>
 </dl><!-- net -->
 
+<dl id="os/exec"><dt><a href="/pkg/os/exec/">os/exec</a></dt>
+  <dd><!-- https://go.dev/issue/50599 -->
+    <p>
+      An <code>exec.Cmd</code> with a non-empty <code>Dir</code> and a
+      nil <code>Env</code> now implicitly sets the <code>PWD</code> environment
+      variable for the subprocess to match <code>Dir</code>.
+    </p>
+    <p>
+      The new method <code>(*exec.Cmd).Environ</code> reports the
+      environment that would be used to run the command, including the
+      aforementioned <code>PWD</code> variable.
+    </p>
+  </dd>
+</dl> <!-- os/exec -->
+
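A sketch of the documented os/exec behavior; the command and directory are assumptions for illustration:

	package main

	import (
		"fmt"
		"os/exec"
	)

	func main() {
		cmd := exec.Command("pwd")
		cmd.Dir = "/tmp"
		// Env is nil, so Environ reports the inherited environment plus
		// the implicit PWD=/tmp entry described above.
		for _, kv := range cmd.Environ() {
			if kv == "PWD=/tmp" {
				fmt.Println("found", kv)
			}
		}
	}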
 <dl id="runtime"><dt><a href="/pkg/runtime/">runtime</a></dt>
   <dd>
     <p><!-- https://go.dev/issue/51461 -->
diff --git a/doc/go_spec.html b/doc/go_spec.html
index b496e9e..b272cb5 100644
--- a/doc/go_spec.html
+++ b/doc/go_spec.html
@@ -1,6 +1,6 @@
 <!--{
 	"Title": "The Go Programming Language Specification",
-	"Subtitle": "Version of March 30, 2022",
+	"Subtitle": "Version of April 19, 2022",
 	"Path": "/ref/spec"
 }-->
 
@@ -1278,7 +1278,8 @@
 <p>
 Every type that is a member of the type set of an interface implements that interface.
 Any given type may implement several distinct interfaces.
-For instance, all types implement the <i>empty interface</i> which stands for the set of all types:
+For instance, all types implement the <i>empty interface</i> which stands for the set
+of all (non-interface) types:
 </p>
 
 <pre>
@@ -1380,7 +1381,7 @@
 		of its interface elements.
 	</li>
 
-	<li>The type set of a method specification is the set of types
+	<li>The type set of a method specification is the set of all non-interface types
 		whose method sets include that method.
 	</li>
 
@@ -1389,7 +1390,7 @@
 	</li>
 
 	<li>The type set of a term of the form <code>~T</code>
-		is the set of types whose underlying type is <code>T</code>.
+		is the set of all types whose underlying type is <code>T</code>.
 	</li>
 
 	<li>The type set of a <i>union</i> of terms
@@ -1399,6 +1400,15 @@
 </ul>
 
 <p>
+The quantification "the set of all non-interface types" refers not just to all (non-interface)
+types declared in the program at hand, but to all possible types in all possible programs, and
+hence is infinite.
+Similarly, given the set of all non-interface types that implement a particular method, the
+intersection of the method sets of those types will contain exactly that method, even if all
+types in the program at hand always pair that method with another method.
+</p>
+
+<p>
 By construction, an interface's type set never contains an interface type.
 </p>
 
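A short illustration of the ~T rule above: the type set of ~int contains every type whose underlying type is int, so a named type satisfies the constraint (names are illustrative):

	package main

	type Integer interface{ ~int }

	type MyInt int // underlying type is int

	func zero[T Integer]() T { var z T; return z }

	func main() {
		_ = zero[MyInt]() // MyInt is in the type set of ~int
		_ = zero[int]()
	}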
diff --git a/misc/cgo/test/callback_c.c b/misc/cgo/test/callback_c.c
index 8921b73..8ecf70f 100644
--- a/misc/cgo/test/callback_c.c
+++ b/misc/cgo/test/callback_c.c
@@ -3,8 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include <string.h>
-#include <sys/types.h>
-#include <unistd.h>
+
 #include "_cgo_export.h"
 
 void
@@ -31,32 +30,10 @@
 	BackIntoGo();
 }
 
-#ifdef WIN32
-#include <windows.h>
-long long
-mysleep(int seconds) {
-	long long st = GetTickCount();
-	Sleep(1000 * seconds);
-	return st;
-}
-#else
-#include <sys/time.h>
-long long
-mysleep(int seconds) {
-	long long st;
-	struct timeval tv;
-	gettimeofday(&tv, NULL);
-	st = tv.tv_sec * 1000 + tv.tv_usec / 1000;
-	sleep(seconds);
-	return st;
-}
-#endif
-
-long long
-twoSleep(int n)
+void
+Issue1560InC(void)
 {
-	BackgroundSleep(n);
-	return mysleep(n);
+	Issue1560FromC();
 }
 
 void
diff --git a/misc/cgo/test/cgo_test.go b/misc/cgo/test/cgo_test.go
index 774277e..dee6164 100644
--- a/misc/cgo/test/cgo_test.go
+++ b/misc/cgo/test/cgo_test.go
@@ -11,6 +11,7 @@
 // These wrappers are here for gotest to find.
 
 func Test1328(t *testing.T)                  { test1328(t) }
+func Test1560(t *testing.T)                  { test1560(t) }
 func Test1635(t *testing.T)                  { test1635(t) }
 func Test3250(t *testing.T)                  { test3250(t) }
 func Test3729(t *testing.T)                  { test3729(t) }
@@ -89,7 +90,6 @@
 func TestMultipleAssign(t *testing.T)        { testMultipleAssign(t) }
 func TestNaming(t *testing.T)                { testNaming(t) }
 func TestPanicFromC(t *testing.T)            { testPanicFromC(t) }
-func TestParallelSleep(t *testing.T)         { testParallelSleep(t) }
 func TestPrintf(t *testing.T)                { testPrintf(t) }
 func TestReturnAfterGrow(t *testing.T)       { testReturnAfterGrow(t) }
 func TestReturnAfterGrowFromGo(t *testing.T) { testReturnAfterGrowFromGo(t) }
diff --git a/misc/cgo/test/testx.go b/misc/cgo/test/testx.go
index 8ec84a8..6a8e97d 100644
--- a/misc/cgo/test/testx.go
+++ b/misc/cgo/test/testx.go
@@ -18,7 +18,6 @@
 	"sync"
 	"sync/atomic"
 	"testing"
-	"time"
 	"unsafe"
 )
 
@@ -30,8 +29,7 @@
 void IntoC(void);
 
 // issue 1560
-// mysleep returns the absolute start time in ms.
-long long mysleep(int seconds);
+extern void Issue1560InC(void);
 
 // twoSleep returns the absolute start time of the first sleep
 // in ms.
@@ -183,35 +181,40 @@
 }
 
 // issue 1560
+// Test that C functions and Go functions run in parallel.
 
-var sleepDone = make(chan int64)
+var (
+	issue1560 int32
 
-// parallelSleep returns the absolute difference between the start time
-// of the two sleeps.
-func parallelSleep(n int) int64 {
-	t := int64(C.twoSleep(C.int(n))) - <-sleepDone
-	if t < 0 {
-		return -t
+	issue1560Ch = make(chan bool, 2)
+)
+
+//export Issue1560FromC
+func Issue1560FromC() {
+	for atomic.LoadInt32(&issue1560) != 1 {
+		runtime.Gosched()
 	}
-	return t
+	atomic.AddInt32(&issue1560, 1)
+	for atomic.LoadInt32(&issue1560) != 3 {
+		runtime.Gosched()
+	}
+	issue1560Ch <- true
 }
 
-//export BackgroundSleep
-func BackgroundSleep(n int32) {
-	go func() {
-		sleepDone <- int64(C.mysleep(C.int(n)))
-	}()
+func Issue1560FromGo() {
+	atomic.AddInt32(&issue1560, 1)
+	for atomic.LoadInt32(&issue1560) != 2 {
+		runtime.Gosched()
+	}
+	atomic.AddInt32(&issue1560, 1)
+	issue1560Ch <- true
 }
 
-func testParallelSleep(t *testing.T) {
-	sleepSec := 1
-	dt := time.Duration(parallelSleep(sleepSec)) * time.Millisecond
-	t.Logf("difference in start time for two sleep(%d) is %v", sleepSec, dt)
-	// bug used to run sleeps in serial, producing a 2*sleepSec-second delay.
-	// we detect if the start times of those sleeps are > 0.5*sleepSec-second.
-	if dt >= time.Duration(sleepSec)*time.Second/2 {
-		t.Fatalf("parallel %d-second sleeps slept for %f seconds", sleepSec, dt.Seconds())
-	}
+func test1560(t *testing.T) {
+	go Issue1560FromGo()
+	go C.Issue1560InC()
+	<-issue1560Ch
+	<-issue1560Ch
 }
 
 // issue 2462
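The cross-language handshake above is an instance of a lockstep pattern that also works in pure Go; a standalone sketch (the phase values mirror the test, everything else is illustrative):

	package main

	import (
		"runtime"
		"sync/atomic"
	)

	func main() {
		var state int32
		done := make(chan bool, 2)
		go func() { // plays the role of the Go side
			atomic.AddInt32(&state, 1) // 0 -> 1
			for atomic.LoadInt32(&state) != 2 {
				runtime.Gosched()
			}
			atomic.AddInt32(&state, 1) // 2 -> 3
			done <- true
		}()
		go func() { // plays the role of the C side
			for atomic.LoadInt32(&state) != 1 {
				runtime.Gosched()
			}
			atomic.AddInt32(&state, 1) // 1 -> 2
			for atomic.LoadInt32(&state) != 3 {
				runtime.Gosched()
			}
			done <- true
		}()
		<-done
		<-done
	}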
diff --git a/src/bootstrap.bash b/src/bootstrap.bash
index 88c080a..4038eaf 100755
--- a/src/bootstrap.bash
+++ b/src/bootstrap.bash
@@ -96,7 +96,7 @@
 	echo "Preparing to generate build system's ${OUTGZ}; cleaning ..."
 	rm -rf bin/gofmt
 	rm -rf src/runtime/race/race_*.syso
-	rm -rf api test doc misc/cgo/test misc/trace
+	rm -rf api test doc misc/cgo/test
 	rm -rf pkg/tool/*_*/{addr2line,api,cgo,cover,doc,fix,nm,objdump,pack,pprof,test2json,trace,vet}
 	rm -rf pkg/*_*/{image,database,cmd}
 	rm -rf $(find . -type d -name testdata)
diff --git a/src/bufio/bufio.go b/src/bufio/bufio.go
index 7483946..bcc273c 100644
--- a/src/bufio/bufio.go
+++ b/src/bufio/bufio.go
@@ -731,13 +731,28 @@
 // If the count is less than len(s), it also returns an error explaining
 // why the write is short.
 func (b *Writer) WriteString(s string) (int, error) {
+	var sw io.StringWriter
+	tryStringWriter := true
+
 	nn := 0
 	for len(s) > b.Available() && b.err == nil {
-		n := copy(b.buf[b.n:], s)
-		b.n += n
+		var n int
+		if b.Buffered() == 0 && sw == nil && tryStringWriter {
+			// Check at most once whether b.wr is a StringWriter.
+			sw, tryStringWriter = b.wr.(io.StringWriter)
+		}
+		if b.Buffered() == 0 && tryStringWriter {
+			// Large write, empty buffer, and the underlying writer supports
+			// WriteString: forward the write to the underlying StringWriter.
+			// This avoids an extra copy.
+			n, b.err = sw.WriteString(s)
+		} else {
+			n = copy(b.buf[b.n:], s)
+			b.n += n
+			b.Flush()
+		}
 		nn += n
 		s = s[n:]
-		b.Flush()
 	}
 	if b.err != nil {
 		return nn, b.err
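The fast path added above kicks in when the buffer is empty and the write is larger than it; a sketch of a triggering call, with the buffer size chosen small for illustration:

	package main

	import (
		"bufio"
		"fmt"
		"strings"
	)

	func main() {
		var sb strings.Builder // strings.Builder implements io.StringWriter
		w := bufio.NewWriterSize(&sb, 8)
		// Longer than the 8-byte buffer while the buffer is empty: the
		// write is forwarded to sb.WriteString, skipping the copy
		// through w's internal buffer.
		w.WriteString("longer than the eight-byte buffer")
		w.Flush()
		fmt.Println(sb.String())
	}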
diff --git a/src/bufio/bufio_test.go b/src/bufio/bufio_test.go
index ff3396e..b3456d2 100644
--- a/src/bufio/bufio_test.go
+++ b/src/bufio/bufio_test.go
@@ -762,6 +762,67 @@
 	}
 }
 
+func TestWriteStringStringWriter(t *testing.T) {
+	const BufSize = 8
+	{
+		tw := &teststringwriter{}
+		b := NewWriterSize(tw, BufSize)
+		b.WriteString("1234")
+		tw.check(t, "", "")
+		b.WriteString("56789012")   // longer than BufSize
+		tw.check(t, "12345678", "") // but not enough (after filling the partially-filled buffer)
+		b.Flush()
+		tw.check(t, "123456789012", "")
+	}
+	{
+		tw := &teststringwriter{}
+		b := NewWriterSize(tw, BufSize)
+		b.WriteString("123456789")   // long string, empty buffer:
+		tw.check(t, "", "123456789") // use WriteString
+	}
+	{
+		tw := &teststringwriter{}
+		b := NewWriterSize(tw, BufSize)
+		b.WriteString("abc")
+		tw.check(t, "", "")
+		b.WriteString("123456789012345")      // long string, non-empty buffer
+		tw.check(t, "abc12345", "6789012345") // use Write and then WriteString since the remaining part is still longer than BufSize
+	}
+	{
+		tw := &teststringwriter{}
+		b := NewWriterSize(tw, BufSize)
+		b.Write([]byte("abc")) // same as above, but use Write instead of WriteString
+		tw.check(t, "", "")
+		b.WriteString("123456789012345")
+		tw.check(t, "abc12345", "6789012345") // same as above
+	}
+}
+
+type teststringwriter struct {
+	write       string
+	writeString string
+}
+
+func (w *teststringwriter) Write(b []byte) (int, error) {
+	w.write += string(b)
+	return len(b), nil
+}
+
+func (w *teststringwriter) WriteString(s string) (int, error) {
+	w.writeString += s
+	return len(s), nil
+}
+
+func (w *teststringwriter) check(t *testing.T, write, writeString string) {
+	t.Helper()
+	if w.write != write {
+		t.Errorf("write: expected %q, got %q", write, w.write)
+	}
+	if w.writeString != writeString {
+		t.Errorf("writeString: expected %q, got %q", writeString, w.writeString)
+	}
+}
+
 func TestBufferFull(t *testing.T) {
 	const longString = "And now, hello, world! It is the time for all good men to come to the aid of their party"
 	buf := NewReaderSize(strings.NewReader(longString), minReadBufferSize)
diff --git a/src/cmd/compile/abi-internal.md b/src/cmd/compile/abi-internal.md
index be47e9a..14464ed 100644
--- a/src/cmd/compile/abi-internal.md
+++ b/src/cmd/compile/abi-internal.md
@@ -32,19 +32,19 @@
 Those that aren't guaranteed may change in future versions of Go (for
 example, we've considered changing the alignment of int64 on 32-bit).
 
-| Type | 64-bit |       | 32-bit |       |
-| ---  | ---    | ---   | ---    | ---   |
-|      | Size   | Align | Size   | Align |
-| bool, uint8, int8  | 1  | 1 | 1  | 1 |
-| uint16, int16      | 2  | 2 | 2  | 2 |
-| uint32, int32      | 4  | 4 | 4  | 4 |
-| uint64, int64      | 8  | 8 | 8  | 4 |
-| int, uint          | 8  | 8 | 4  | 4 |
-| float32            | 4  | 4 | 4  | 4 |
-| float64            | 8  | 8 | 8  | 4 |
-| complex64          | 8  | 4 | 8  | 4 |
-| complex128         | 16 | 8 | 16 | 4 |
-| uintptr, *T, unsafe.Pointer | 8 | 8 | 4 | 4 |
+| Type                        | 64-bit |       | 32-bit |       |
+|-----------------------------|--------|-------|--------|-------|
+|                             | Size   | Align | Size   | Align |
+| bool, uint8, int8           | 1      | 1     | 1      | 1     |
+| uint16, int16               | 2      | 2     | 2      | 2     |
+| uint32, int32               | 4      | 4     | 4      | 4     |
+| uint64, int64               | 8      | 8     | 8      | 4     |
+| int, uint                   | 8      | 8     | 4      | 4     |
+| float32                     | 4      | 4     | 4      | 4     |
+| float64                     | 8      | 8     | 8      | 4     |
+| complex64                   | 8      | 4     | 8      | 4     |
+| complex128                  | 16     | 8     | 16     | 4     |
+| uintptr, *T, unsafe.Pointer | 8      | 8     | 4      | 4     |
 
 The types `byte` and `rune` are aliases for `uint8` and `int32`,
 respectively, and hence have the same size and alignment as these
diff --git a/src/cmd/compile/doc.go b/src/cmd/compile/doc.go
index ef7fa86..b8862f6 100644
--- a/src/cmd/compile/doc.go
+++ b/src/cmd/compile/doc.go
@@ -219,11 +219,13 @@
 	//go:uintptrescapes
 
 The //go:uintptrescapes directive must be followed by a function declaration.
-It specifies that the function's uintptr arguments may be pointer values
-that have been converted to uintptr and must be treated as such by the
-garbage collector. The conversion from pointer to uintptr must appear in
-the argument list of any call to this function. This directive is necessary
-for some low-level system call implementations and should be avoided otherwise.
+It specifies that the function's uintptr arguments may be pointer values that
+have been converted to uintptr and must be on the heap and kept alive for the
+duration of the call, even though from the types alone it would appear that the
+object is no longer needed during the call. The conversion from pointer to
+uintptr must appear in the argument list of any call to this function. This
+directive is necessary for some low-level system call implementations and
+should be avoided otherwise.
 
 	//go:noinline
 
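A hedged sketch of the calling pattern the //go:uintptrescapes documentation describes; the function names are assumptions, not real syscall wrappers:

	package main

	import "unsafe"

	//go:uintptrescapes
	func lowLevelCall(p uintptr) {} // stand-in for a syscall-style wrapper

	func use(buf *[64]byte) {
		// The pointer-to-uintptr conversion appears directly in the
		// argument list, so *buf is kept alive for the whole call.
		lowLevelCall(uintptr(unsafe.Pointer(buf)))
	}

	func main() {
		var b [64]byte
		use(&b)
	}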
diff --git a/src/cmd/compile/internal/compare/compare.go b/src/cmd/compile/internal/compare/compare.go
new file mode 100644
index 0000000..c0017b1
--- /dev/null
+++ b/src/cmd/compile/internal/compare/compare.go
@@ -0,0 +1,272 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package compare contains code for generating comparison
+// routines for structs, strings and interfaces.
+package compare
+
+import (
+	"cmd/compile/internal/base"
+	"cmd/compile/internal/ir"
+	"cmd/compile/internal/typecheck"
+	"cmd/compile/internal/types"
+	"fmt"
+	"math/bits"
+	"sort"
+)
+
+// IsRegularMemory reports whether t can be compared/hashed as regular memory.
+func IsRegularMemory(t *types.Type) bool {
+	a, _ := types.AlgType(t)
+	return a == types.AMEM
+}
+
+// Memrun finds runs of struct fields for which memory-only algs are appropriate.
+// t is the parent struct type, and start is the field index at which to start the run.
+// size is the length in bytes of the memory included in the run.
+// next is the index just after the end of the memory run.
+func Memrun(t *types.Type, start int) (size int64, next int) {
+	next = start
+	for {
+		next++
+		if next == t.NumFields() {
+			break
+		}
+		// Stop run after a padded field.
+		if types.IsPaddedField(t, next-1) {
+			break
+		}
+		// Also, stop before a blank or non-memory field.
+		if f := t.Field(next); f.Sym.IsBlank() || !IsRegularMemory(f.Type) {
+			break
+		}
+		// For issue 46283, don't combine fields if the resulting load would
+		// require a larger alignment than the component fields.
+		if base.Ctxt.Arch.Alignment > 1 {
+			align := t.Alignment()
+			if off := t.Field(start).Offset; off&(align-1) != 0 {
+				// Offset is less aligned than the containing type.
+				// Use offset to determine alignment.
+				align = 1 << uint(bits.TrailingZeros64(uint64(off)))
+			}
+			size := t.Field(next).End() - t.Field(start).Offset
+			if size > align {
+				break
+			}
+		}
+	}
+	return t.Field(next-1).End() - t.Field(start).Offset, next
+}
+
+// EqCanPanic reports whether == on type t could panic (has an interface somewhere).
+// t must be comparable.
+func EqCanPanic(t *types.Type) bool {
+	switch t.Kind() {
+	default:
+		return false
+	case types.TINTER:
+		return true
+	case types.TARRAY:
+		return EqCanPanic(t.Elem())
+	case types.TSTRUCT:
+		for _, f := range t.FieldSlice() {
+			if !f.Sym.IsBlank() && EqCanPanic(f.Type) {
+				return true
+			}
+		}
+		return false
+	}
+}
+
+// EqStruct compares two structs np and nq for equality.
+// It works by building a list of boolean conditions to satisfy.
+// Conditions must be evaluated in the returned order and
+// properly short circuited by the caller.
+func EqStruct(t *types.Type, np, nq ir.Node) []ir.Node {
+	// The conditions are a list-of-lists. Conditions are reorderable
+	// within each inner list. The outer lists must be evaluated in order.
+	var conds [][]ir.Node
+	conds = append(conds, []ir.Node{})
+	and := func(n ir.Node) {
+		i := len(conds) - 1
+		conds[i] = append(conds[i], n)
+	}
+
+	// Walk the struct using memequal for runs of AMEM
+	// and calling specific equality tests for the others.
+	for i, fields := 0, t.FieldSlice(); i < len(fields); {
+		f := fields[i]
+
+		// Skip blank-named fields.
+		if f.Sym.IsBlank() {
+			i++
+			continue
+		}
+
+		// Compare non-memory fields with field equality.
+		if !IsRegularMemory(f.Type) {
+			if EqCanPanic(f.Type) {
+				// Enforce ordering by starting a new set of reorderable conditions.
+				conds = append(conds, []ir.Node{})
+			}
+			p := ir.NewSelectorExpr(base.Pos, ir.OXDOT, np, f.Sym)
+			q := ir.NewSelectorExpr(base.Pos, ir.OXDOT, nq, f.Sym)
+			switch {
+			case f.Type.IsString():
+				eqlen, eqmem := EqString(p, q)
+				and(eqlen)
+				and(eqmem)
+			default:
+				and(ir.NewBinaryExpr(base.Pos, ir.OEQ, p, q))
+			}
+			if EqCanPanic(f.Type) {
+				// Also enforce ordering after something that can panic.
+				conds = append(conds, []ir.Node{})
+			}
+			i++
+			continue
+		}
+
+		// Find maximal length run of memory-only fields.
+		size, next := Memrun(t, i)
+
+		// TODO(rsc): All the calls to newname are wrong for
+		// cross-package unexported fields.
+		if s := fields[i:next]; len(s) <= 2 {
+			// Two or fewer fields: use plain field equality.
+			for _, f := range s {
+				and(eqfield(np, nq, ir.OEQ, f.Sym))
+			}
+		} else {
+			// More than two fields: use memequal.
+			cc := eqmem(np, nq, f.Sym, size)
+			and(cc)
+		}
+		i = next
+	}
+
+	// Sort conditions to put runtime calls last.
+	// Preserve the rest of the ordering.
+	var flatConds []ir.Node
+	for _, c := range conds {
+		isCall := func(n ir.Node) bool {
+			return n.Op() == ir.OCALL || n.Op() == ir.OCALLFUNC
+		}
+		sort.SliceStable(c, func(i, j int) bool {
+			return !isCall(c[i]) && isCall(c[j])
+		})
+		flatConds = append(flatConds, c...)
+	}
+	return flatConds
+}
+
+// EqString returns the nodes
+//
+//	len(s) == len(t)
+//
+// and
+//
+//	memequal(s.ptr, t.ptr, len(s))
+//
+// which can be used to construct string equality comparison.
+// eqlen must be evaluated before eqmem, and shortcircuiting is required.
+func EqString(s, t ir.Node) (eqlen *ir.BinaryExpr, eqmem *ir.CallExpr) {
+	s = typecheck.Conv(s, types.Types[types.TSTRING])
+	t = typecheck.Conv(t, types.Types[types.TSTRING])
+	sptr := ir.NewUnaryExpr(base.Pos, ir.OSPTR, s)
+	tptr := ir.NewUnaryExpr(base.Pos, ir.OSPTR, t)
+	slen := typecheck.Conv(ir.NewUnaryExpr(base.Pos, ir.OLEN, s), types.Types[types.TUINTPTR])
+	tlen := typecheck.Conv(ir.NewUnaryExpr(base.Pos, ir.OLEN, t), types.Types[types.TUINTPTR])
+
+	fn := typecheck.LookupRuntime("memequal")
+	fn = typecheck.SubstArgTypes(fn, types.Types[types.TUINT8], types.Types[types.TUINT8])
+	call := typecheck.Call(base.Pos, fn, []ir.Node{sptr, tptr, ir.Copy(slen)}, false).(*ir.CallExpr)
+
+	cmp := ir.NewBinaryExpr(base.Pos, ir.OEQ, slen, tlen)
+	cmp = typecheck.Expr(cmp).(*ir.BinaryExpr)
+	cmp.SetType(types.Types[types.TBOOL])
+	return cmp, call
+}
+
+// EqInterface returns the nodes
+//
+//	s.tab == t.tab (or s.typ == t.typ, as appropriate)
+//
+// and
+//
+//	ifaceeq(s.tab, s.data, t.data) (or efaceeq(s.typ, s.data, t.data), as appropriate)
+//
+// which can be used to construct interface equality comparison.
+// eqtab must be evaluated before eqdata, and shortcircuiting is required.
+func EqInterface(s, t ir.Node) (eqtab *ir.BinaryExpr, eqdata *ir.CallExpr) {
+	if !types.Identical(s.Type(), t.Type()) {
+		base.Fatalf("EqInterface %v %v", s.Type(), t.Type())
+	}
+	// func ifaceeq(tab *uintptr, x, y unsafe.Pointer) (ret bool)
+	// func efaceeq(typ *uintptr, x, y unsafe.Pointer) (ret bool)
+	var fn ir.Node
+	if s.Type().IsEmptyInterface() {
+		fn = typecheck.LookupRuntime("efaceeq")
+	} else {
+		fn = typecheck.LookupRuntime("ifaceeq")
+	}
+
+	stab := ir.NewUnaryExpr(base.Pos, ir.OITAB, s)
+	ttab := ir.NewUnaryExpr(base.Pos, ir.OITAB, t)
+	sdata := ir.NewUnaryExpr(base.Pos, ir.OIDATA, s)
+	tdata := ir.NewUnaryExpr(base.Pos, ir.OIDATA, t)
+	sdata.SetType(types.Types[types.TUNSAFEPTR])
+	tdata.SetType(types.Types[types.TUNSAFEPTR])
+	sdata.SetTypecheck(1)
+	tdata.SetTypecheck(1)
+
+	call := typecheck.Call(base.Pos, fn, []ir.Node{stab, sdata, tdata}, false).(*ir.CallExpr)
+
+	cmp := ir.NewBinaryExpr(base.Pos, ir.OEQ, stab, ttab)
+	cmp = typecheck.Expr(cmp).(*ir.BinaryExpr)
+	cmp.SetType(types.Types[types.TBOOL])
+	return cmp, call
+}
+
+// eqfield returns the node
+//
+//	p.field == q.field
+func eqfield(p ir.Node, q ir.Node, op ir.Op, field *types.Sym) ir.Node {
+	nx := ir.NewSelectorExpr(base.Pos, ir.OXDOT, p, field)
+	ny := ir.NewSelectorExpr(base.Pos, ir.OXDOT, q, field)
+	ne := ir.NewBinaryExpr(base.Pos, op, nx, ny)
+	return ne
+}
+
+// eqmem returns the node
+//
+//	memequal(&p.field, &q.field [, size])
+func eqmem(p ir.Node, q ir.Node, field *types.Sym, size int64) ir.Node {
+	nx := typecheck.Expr(typecheck.NodAddr(ir.NewSelectorExpr(base.Pos, ir.OXDOT, p, field)))
+	ny := typecheck.Expr(typecheck.NodAddr(ir.NewSelectorExpr(base.Pos, ir.OXDOT, q, field)))
+
+	fn, needsize := eqmemfunc(size, nx.Type().Elem())
+	call := ir.NewCallExpr(base.Pos, ir.OCALL, fn, nil)
+	call.Args.Append(nx)
+	call.Args.Append(ny)
+	if needsize {
+		call.Args.Append(ir.NewInt(size))
+	}
+
+	return call
+}
+
+func eqmemfunc(size int64, t *types.Type) (fn *ir.Name, needsize bool) {
+	switch size {
+	default:
+		fn = typecheck.LookupRuntime("memequal")
+		needsize = true
+	case 1, 2, 4, 8, 16:
+		buf := fmt.Sprintf("memequal%d", int(size)*8)
+		fn = typecheck.LookupRuntime(buf)
+	}
+
+	fn = typecheck.SubstArgTypes(fn, t, t)
+	return fn, needsize
+}
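A conceptual illustration (hand-written, not compiler output) of the condition ordering EqStruct produces for a struct mixing memory fields and a string: cheap comparisons first, the runtime-call-backed string comparison short-circuited last:

	package main

	type T struct {
		a, b int
		s    string
	}

	// eq mirrors the generated shape: plain field equality, then the
	// length check, then the memequal-style data comparison.
	func eq(p, q T) bool {
		return p.a == q.a && p.b == q.b &&
			len(p.s) == len(q.s) && p.s == q.s
	}

	func main() {
		println(eq(T{1, 2, "x"}, T{1, 2, "x"})) // true
	}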
diff --git a/src/cmd/compile/internal/escape/escape.go b/src/cmd/compile/internal/escape/escape.go
index 4408a53..05fbe58 100644
--- a/src/cmd/compile/internal/escape/escape.go
+++ b/src/cmd/compile/internal/escape/escape.go
@@ -422,8 +422,6 @@
 	}
 
 	if fn.Pragma&ir.UintptrEscapes != 0 {
-		fn.Pragma |= ir.UintptrKeepAlive
-
 		if f.Type.IsUintptr() {
 			if diagnose {
 				base.WarnfAt(f.Pos, "marking %v as escaping uintptr", name())
diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go
index 8c2ea49..486a6ad 100644
--- a/src/cmd/compile/internal/inline/inl.go
+++ b/src/cmd/compile/internal/inline/inl.go
@@ -120,6 +120,17 @@
 		return
 	}
 
+	// If marked as "go:uintptrkeepalive", don't inline, since the
+	// keep alive information is lost during inlining.
+	//
+	// TODO(prattmic): This is handled on calls during escape analysis,
+	// which is after inlining. Move prior to inlining so the keep-alive is
+	// maintained after inlining.
+	if fn.Pragma&ir.UintptrKeepAlive != 0 {
+		reason = "marked as having a keep-alive uintptr argument"
+		return
+	}
+
 	// If marked as "go:uintptrescapes", don't inline, since the
 	// escape information is lost during inlining.
 	if fn.Pragma&ir.UintptrEscapes != 0 {
diff --git a/src/cmd/compile/internal/ir/node.go b/src/cmd/compile/internal/ir/node.go
index 9ccb8e3..0d91d17 100644
--- a/src/cmd/compile/internal/ir/node.go
+++ b/src/cmd/compile/internal/ir/node.go
@@ -459,7 +459,7 @@
 	Noinline                    // func should not be inlined
 	NoCheckPtr                  // func should not be instrumented by checkptr
 	CgoUnsafeArgs               // treat a pointer to one arg as a pointer to them all
-	UintptrKeepAlive            // pointers converted to uintptr must be kept alive (compiler internal only)
+	UintptrKeepAlive            // pointers converted to uintptr must be kept alive
 	UintptrEscapes              // pointers converted to uintptr escape
 
 	// Runtime-only func pragmas.
diff --git a/src/cmd/compile/internal/noder/decl.go b/src/cmd/compile/internal/noder/decl.go
index f985648..91a90d9 100644
--- a/src/cmd/compile/internal/noder/decl.go
+++ b/src/cmd/compile/internal/noder/decl.go
@@ -125,8 +125,26 @@
 		}
 	}
 
-	if decl.Body != nil && fn.Pragma&ir.Noescape != 0 {
-		base.ErrorfAt(fn.Pos(), "can only use //go:noescape with external func implementations")
+	if decl.Body != nil {
+		if fn.Pragma&ir.Noescape != 0 {
+			base.ErrorfAt(fn.Pos(), "can only use //go:noescape with external func implementations")
+		}
+		if (fn.Pragma&ir.UintptrKeepAlive != 0 && fn.Pragma&ir.UintptrEscapes == 0) && fn.Pragma&ir.Nosplit == 0 {
+			// Stack growth can't handle uintptr arguments that may
+			// be pointers (as we don't know which are pointers
+			// when creating the stack map). Thus uintptrkeepalive
+			// functions (and all transitive callees) must be
+			// nosplit.
+			//
+			// N.B. uintptrescapes implies uintptrkeepalive but it
+			// is OK since the arguments must escape to the heap.
+			//
+			// TODO(prattmic): Add recursive nosplit check of callees.
+			// TODO(prattmic): Functions with no body (i.e.,
+			// assembly) must also be nosplit, but we can't check
+			// that here.
+			base.ErrorfAt(fn.Pos(), "go:uintptrkeepalive requires go:nosplit")
+		}
 	}
 
 	if decl.Name.Value == "init" && decl.Recv == nil {
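A sketch of a declaration the new check accepts; since //go:uintptrkeepalive is restricted to the standard library (see the noder change below), this shape only compiles there, and the names are illustrative:

	// Accepted: the keep-alive directive paired with nosplit.
	//
	//go:uintptrkeepalive
	//go:nosplit
	func keepAliveCall(p uintptr) uintptr {
		// Omitting //go:nosplit here would be rejected with
		// "go:uintptrkeepalive requires go:nosplit".
		return p
	}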
diff --git a/src/cmd/compile/internal/noder/lex.go b/src/cmd/compile/internal/noder/lex.go
index 66a56a5..cef0f08 100644
--- a/src/cmd/compile/internal/noder/lex.go
+++ b/src/cmd/compile/internal/noder/lex.go
@@ -30,6 +30,7 @@
 		ir.NoCheckPtr |
 		ir.RegisterParams | // TODO(register args) remove after register abi is working
 		ir.CgoUnsafeArgs |
+		ir.UintptrKeepAlive |
 		ir.UintptrEscapes |
 		ir.Systemstack |
 		ir.Nowritebarrier |
@@ -67,19 +68,13 @@
 		return ir.Yeswritebarrierrec
 	case "go:cgo_unsafe_args":
 		return ir.CgoUnsafeArgs | ir.NoCheckPtr // implies NoCheckPtr (see #34968)
+	case "go:uintptrkeepalive":
+		return ir.UintptrKeepAlive
 	case "go:uintptrescapes":
-		// For the next function declared in the file
-		// any uintptr arguments may be pointer values
-		// converted to uintptr. This directive
-		// ensures that the referenced allocated
-		// object, if any, is retained and not moved
-		// until the call completes, even though from
-		// the types alone it would appear that the
-		// object is no longer needed during the
-		// call. The conversion to uintptr must appear
-		// in the argument list.
-		// Used in syscall/dll_windows.go.
-		return ir.UintptrEscapes
+		// This directive extends //go:uintptrkeepalive by forcing
+		// uintptr arguments to escape to the heap, which makes stack
+		// growth safe.
+		return ir.UintptrEscapes | ir.UintptrKeepAlive // implies UintptrKeepAlive
 	case "go:registerparams": // TODO(register args) remove after register abi is working
 		return ir.RegisterParams
 	case "go:notinheap":
diff --git a/src/cmd/compile/internal/noder/noder.go b/src/cmd/compile/internal/noder/noder.go
index cc5610a..9a42b5a 100644
--- a/src/cmd/compile/internal/noder/noder.go
+++ b/src/cmd/compile/internal/noder/noder.go
@@ -340,6 +340,9 @@
 		if !base.Flag.CompilingRuntime && flag&runtimePragmas != 0 {
 			p.error(syntax.Error{Pos: pos, Msg: fmt.Sprintf("//%s only allowed in runtime", verb)})
 		}
+		if flag == ir.UintptrKeepAlive && !base.Flag.Std {
+			p.error(syntax.Error{Pos: pos, Msg: fmt.Sprintf("//%s is only allowed in the standard library", verb)})
+		}
 		if flag == 0 && !allowedStdPragmas[verb] && base.Flag.Std {
 			p.error(syntax.Error{Pos: pos, Msg: fmt.Sprintf("//%s is not allowed in the standard library", verb)})
 		}
diff --git a/src/cmd/compile/internal/noder/writer.go b/src/cmd/compile/internal/noder/writer.go
index c5c346b..0fb162d 100644
--- a/src/cmd/compile/internal/noder/writer.go
+++ b/src/cmd/compile/internal/noder/writer.go
@@ -742,6 +742,22 @@
 		if pragma&ir.Noescape != 0 {
 			w.p.errorf(decl, "can only use //go:noescape with external func implementations")
 		}
+		if (pragma&ir.UintptrKeepAlive != 0 && pragma&ir.UintptrEscapes == 0) && pragma&ir.Nosplit == 0 {
+			// Stack growth can't handle uintptr arguments that may
+			// be pointers (as we don't know which are pointers
+			// when creating the stack map). Thus uintptrkeepalive
+			// functions (and all transitive callees) must be
+			// nosplit.
+			//
+			// N.B. uintptrescapes implies uintptrkeepalive but it
+			// is OK since the arguments must escape to the heap.
+			//
+			// TODO(prattmic): Add recursive nosplit check of callees.
+			// TODO(prattmic): Functions with no body (i.e.,
+			// assembly) must also be nosplit, but we can't check
+			// that here.
+			w.p.errorf(decl, "go:uintptrkeepalive requires go:nosplit")
+		}
 	} else {
 		if base.Flag.Complete || decl.Name.Value == "init" {
 			// Linknamed functions are allowed to have no body. Hopefully
diff --git a/src/cmd/compile/internal/reflectdata/alg.go b/src/cmd/compile/internal/reflectdata/alg.go
index 9fe90da..de23387 100644
--- a/src/cmd/compile/internal/reflectdata/alg.go
+++ b/src/cmd/compile/internal/reflectdata/alg.go
@@ -6,10 +6,9 @@
 
 import (
 	"fmt"
-	"math/bits"
-	"sort"
 
 	"cmd/compile/internal/base"
+	"cmd/compile/internal/compare"
 	"cmd/compile/internal/ir"
 	"cmd/compile/internal/objw"
 	"cmd/compile/internal/typecheck"
@@ -17,32 +16,6 @@
 	"cmd/internal/obj"
 )
 
-// isRegularMemory reports whether t can be compared/hashed as regular memory.
-func isRegularMemory(t *types.Type) bool {
-	a, _ := types.AlgType(t)
-	return a == types.AMEM
-}
-
-// eqCanPanic reports whether == on type t could panic (has an interface somewhere).
-// t must be comparable.
-func eqCanPanic(t *types.Type) bool {
-	switch t.Kind() {
-	default:
-		return false
-	case types.TINTER:
-		return true
-	case types.TARRAY:
-		return eqCanPanic(t.Elem())
-	case types.TSTRUCT:
-		for _, f := range t.FieldSlice() {
-			if !f.Sym.IsBlank() && eqCanPanic(f.Type) {
-				return true
-			}
-		}
-		return false
-	}
-}
-
 // AlgType returns the fixed-width AMEMxx variants instead of the general
 // AMEM kind when possible.
 func AlgType(t *types.Type) types.AlgKind {
@@ -206,7 +179,7 @@
 			}
 
 			// Hash non-memory fields with appropriate hash function.
-			if !isRegularMemory(f.Type) {
+			if !compare.IsRegularMemory(f.Type) {
 				hashel := hashfor(f.Type)
 				call := ir.NewCallExpr(base.Pos, ir.OCALL, hashel, nil)
 				nx := ir.NewSelectorExpr(base.Pos, ir.OXDOT, np, f.Sym) // TODO: fields from other packages?
@@ -219,7 +192,7 @@
 			}
 
 			// Otherwise, hash a maximal length run of raw memory.
-			size, next := memrun(t, i)
+			size, next := compare.Memrun(t, i)
 
 			// h = hashel(&p.first, size, h)
 			hashel := hashmem(f.Type)
@@ -510,12 +483,12 @@
 			// Second, check that all the contents match (expensive).
 			checkAll(3, false, func(pi, qi ir.Node) ir.Node {
 				// Compare lengths.
-				eqlen, _ := EqString(pi, qi)
+				eqlen, _ := compare.EqString(pi, qi)
 				return eqlen
 			})
 			checkAll(1, true, func(pi, qi ir.Node) ir.Node {
 				// Compare contents.
-				_, eqmem := EqString(pi, qi)
+				_, eqmem := compare.EqString(pi, qi)
 				return eqmem
 			})
 		case types.TFLOAT32, types.TFLOAT64:
@@ -532,81 +505,7 @@
 		}
 
 	case types.TSTRUCT:
-		// Build a list of conditions to satisfy.
-		// The conditions are a list-of-lists. Conditions are reorderable
-		// within each inner list. The outer lists must be evaluated in order.
-		var conds [][]ir.Node
-		conds = append(conds, []ir.Node{})
-		and := func(n ir.Node) {
-			i := len(conds) - 1
-			conds[i] = append(conds[i], n)
-		}
-
-		// Walk the struct using memequal for runs of AMEM
-		// and calling specific equality tests for the others.
-		for i, fields := 0, t.FieldSlice(); i < len(fields); {
-			f := fields[i]
-
-			// Skip blank-named fields.
-			if f.Sym.IsBlank() {
-				i++
-				continue
-			}
-
-			// Compare non-memory fields with field equality.
-			if !isRegularMemory(f.Type) {
-				if eqCanPanic(f.Type) {
-					// Enforce ordering by starting a new set of reorderable conditions.
-					conds = append(conds, []ir.Node{})
-				}
-				p := ir.NewSelectorExpr(base.Pos, ir.OXDOT, np, f.Sym)
-				q := ir.NewSelectorExpr(base.Pos, ir.OXDOT, nq, f.Sym)
-				switch {
-				case f.Type.IsString():
-					eqlen, eqmem := EqString(p, q)
-					and(eqlen)
-					and(eqmem)
-				default:
-					and(ir.NewBinaryExpr(base.Pos, ir.OEQ, p, q))
-				}
-				if eqCanPanic(f.Type) {
-					// Also enforce ordering after something that can panic.
-					conds = append(conds, []ir.Node{})
-				}
-				i++
-				continue
-			}
-
-			// Find maximal length run of memory-only fields.
-			size, next := memrun(t, i)
-
-			// TODO(rsc): All the calls to newname are wrong for
-			// cross-package unexported fields.
-			if s := fields[i:next]; len(s) <= 2 {
-				// Two or fewer fields: use plain field equality.
-				for _, f := range s {
-					and(eqfield(np, nq, f.Sym))
-				}
-			} else {
-				// More than two fields: use memequal.
-				and(eqmem(np, nq, f.Sym, size))
-			}
-			i = next
-		}
-
-		// Sort conditions to put runtime calls last.
-		// Preserve the rest of the ordering.
-		var flatConds []ir.Node
-		for _, c := range conds {
-			isCall := func(n ir.Node) bool {
-				return n.Op() == ir.OCALL || n.Op() == ir.OCALLFUNC
-			}
-			sort.SliceStable(c, func(i, j int) bool {
-				return !isCall(c[i]) && isCall(c[j])
-			})
-			flatConds = append(flatConds, c...)
-		}
-
+		flatConds := compare.EqStruct(t, np, nq)
 		if len(flatConds) == 0 {
 			fn.Body.Append(ir.NewAssignStmt(base.Pos, nr, ir.NewBool(true)))
 		} else {
@@ -631,7 +530,7 @@
 	//   return (or goto ret)
 	fn.Body.Append(ir.NewLabelStmt(base.Pos, neq))
 	fn.Body.Append(ir.NewAssignStmt(base.Pos, nr, ir.NewBool(false)))
-	if eqCanPanic(t) || anyCall(fn) {
+	if compare.EqCanPanic(t) || anyCall(fn) {
 		// Epilogue is large, so share it with the equal case.
 		fn.Body.Append(ir.NewBranchStmt(base.Pos, ir.OGOTO, ret))
 	} else {
@@ -680,153 +579,6 @@
 	})
 }
 
-// eqfield returns the node
-//
-//	p.field == q.field
-func eqfield(p ir.Node, q ir.Node, field *types.Sym) ir.Node {
-	nx := ir.NewSelectorExpr(base.Pos, ir.OXDOT, p, field)
-	ny := ir.NewSelectorExpr(base.Pos, ir.OXDOT, q, field)
-	ne := ir.NewBinaryExpr(base.Pos, ir.OEQ, nx, ny)
-	return ne
-}
-
-// EqString returns the nodes
-//
-//	len(s) == len(t)
-//
-// and
-//
-//	memequal(s.ptr, t.ptr, len(s))
-//
-// which can be used to construct string equality comparison.
-// eqlen must be evaluated before eqmem, and shortcircuiting is required.
-func EqString(s, t ir.Node) (eqlen *ir.BinaryExpr, eqmem *ir.CallExpr) {
-	s = typecheck.Conv(s, types.Types[types.TSTRING])
-	t = typecheck.Conv(t, types.Types[types.TSTRING])
-	sptr := ir.NewUnaryExpr(base.Pos, ir.OSPTR, s)
-	tptr := ir.NewUnaryExpr(base.Pos, ir.OSPTR, t)
-	slen := typecheck.Conv(ir.NewUnaryExpr(base.Pos, ir.OLEN, s), types.Types[types.TUINTPTR])
-	tlen := typecheck.Conv(ir.NewUnaryExpr(base.Pos, ir.OLEN, t), types.Types[types.TUINTPTR])
-
-	fn := typecheck.LookupRuntime("memequal")
-	fn = typecheck.SubstArgTypes(fn, types.Types[types.TUINT8], types.Types[types.TUINT8])
-	call := typecheck.Call(base.Pos, fn, []ir.Node{sptr, tptr, ir.Copy(slen)}, false).(*ir.CallExpr)
-
-	cmp := ir.NewBinaryExpr(base.Pos, ir.OEQ, slen, tlen)
-	cmp = typecheck.Expr(cmp).(*ir.BinaryExpr)
-	cmp.SetType(types.Types[types.TBOOL])
-	return cmp, call
-}
-
-// EqInterface returns the nodes
-//
-//	s.tab == t.tab (or s.typ == t.typ, as appropriate)
-//
-// and
-//
-//	ifaceeq(s.tab, s.data, t.data) (or efaceeq(s.typ, s.data, t.data), as appropriate)
-//
-// which can be used to construct interface equality comparison.
-// eqtab must be evaluated before eqdata, and shortcircuiting is required.
-func EqInterface(s, t ir.Node) (eqtab *ir.BinaryExpr, eqdata *ir.CallExpr) {
-	if !types.Identical(s.Type(), t.Type()) {
-		base.Fatalf("EqInterface %v %v", s.Type(), t.Type())
-	}
-	// func ifaceeq(tab *uintptr, x, y unsafe.Pointer) (ret bool)
-	// func efaceeq(typ *uintptr, x, y unsafe.Pointer) (ret bool)
-	var fn ir.Node
-	if s.Type().IsEmptyInterface() {
-		fn = typecheck.LookupRuntime("efaceeq")
-	} else {
-		fn = typecheck.LookupRuntime("ifaceeq")
-	}
-
-	stab := ir.NewUnaryExpr(base.Pos, ir.OITAB, s)
-	ttab := ir.NewUnaryExpr(base.Pos, ir.OITAB, t)
-	sdata := ir.NewUnaryExpr(base.Pos, ir.OIDATA, s)
-	tdata := ir.NewUnaryExpr(base.Pos, ir.OIDATA, t)
-	sdata.SetType(types.Types[types.TUNSAFEPTR])
-	tdata.SetType(types.Types[types.TUNSAFEPTR])
-	sdata.SetTypecheck(1)
-	tdata.SetTypecheck(1)
-
-	call := typecheck.Call(base.Pos, fn, []ir.Node{stab, sdata, tdata}, false).(*ir.CallExpr)
-
-	cmp := ir.NewBinaryExpr(base.Pos, ir.OEQ, stab, ttab)
-	cmp = typecheck.Expr(cmp).(*ir.BinaryExpr)
-	cmp.SetType(types.Types[types.TBOOL])
-	return cmp, call
-}
-
-// eqmem returns the node
-//
-//	memequal(&p.field, &q.field [, size])
-func eqmem(p ir.Node, q ir.Node, field *types.Sym, size int64) ir.Node {
-	nx := typecheck.Expr(typecheck.NodAddr(ir.NewSelectorExpr(base.Pos, ir.OXDOT, p, field)))
-	ny := typecheck.Expr(typecheck.NodAddr(ir.NewSelectorExpr(base.Pos, ir.OXDOT, q, field)))
-
-	fn, needsize := eqmemfunc(size, nx.Type().Elem())
-	call := ir.NewCallExpr(base.Pos, ir.OCALL, fn, nil)
-	call.Args.Append(nx)
-	call.Args.Append(ny)
-	if needsize {
-		call.Args.Append(ir.NewInt(size))
-	}
-
-	return call
-}
-
-func eqmemfunc(size int64, t *types.Type) (fn *ir.Name, needsize bool) {
-	switch size {
-	default:
-		fn = typecheck.LookupRuntime("memequal")
-		needsize = true
-	case 1, 2, 4, 8, 16:
-		buf := fmt.Sprintf("memequal%d", int(size)*8)
-		fn = typecheck.LookupRuntime(buf)
-	}
-
-	fn = typecheck.SubstArgTypes(fn, t, t)
-	return fn, needsize
-}
-
-// memrun finds runs of struct fields for which memory-only algs are appropriate.
-// t is the parent struct type, and start is the field index at which to start the run.
-// size is the length in bytes of the memory included in the run.
-// next is the index just after the end of the memory run.
-func memrun(t *types.Type, start int) (size int64, next int) {
-	next = start
-	for {
-		next++
-		if next == t.NumFields() {
-			break
-		}
-		// Stop run after a padded field.
-		if types.IsPaddedField(t, next-1) {
-			break
-		}
-		// Also, stop before a blank or non-memory field.
-		if f := t.Field(next); f.Sym.IsBlank() || !isRegularMemory(f.Type) {
-			break
-		}
-		// For issue 46283, don't combine fields if the resulting load would
-		// require a larger alignment than the component fields.
-		if base.Ctxt.Arch.Alignment > 1 {
-			align := t.Alignment()
-			if off := t.Field(start).Offset; off&(align-1) != 0 {
-				// Offset is less aligned than the containing type.
-				// Use offset to determine alignment.
-				align = 1 << uint(bits.TrailingZeros64(uint64(off)))
-			}
-			size := t.Field(next).End() - t.Field(start).Offset
-			if size > align {
-				break
-			}
-		}
-	}
-	return t.Field(next-1).End() - t.Field(start).Offset, next
-}
-
 func hashmem(t *types.Type) ir.Node {
 	sym := ir.Pkgs.Runtime.Lookup("memhash")
 
diff --git a/src/cmd/compile/internal/reflectdata/reflect.go b/src/cmd/compile/internal/reflectdata/reflect.go
index affc679..9553b0d 100644
--- a/src/cmd/compile/internal/reflectdata/reflect.go
+++ b/src/cmd/compile/internal/reflectdata/reflect.go
@@ -14,6 +14,7 @@
 
 	"cmd/compile/internal/base"
 	"cmd/compile/internal/bitvec"
+	"cmd/compile/internal/compare"
 	"cmd/compile/internal/escape"
 	"cmd/compile/internal/inline"
 	"cmd/compile/internal/ir"
@@ -728,7 +729,7 @@
 	if t.Sym() != nil && t.Sym().Name != "" {
 		tflag |= tflagNamed
 	}
-	if isRegularMemory(t) {
+	if compare.IsRegularMemory(t) {
 		tflag |= tflagRegularMemory
 	}
 
diff --git a/src/cmd/compile/internal/syntax/parser.go b/src/cmd/compile/internal/syntax/parser.go
index f18d526..a89dcfa 100644
--- a/src/cmd/compile/internal/syntax/parser.go
+++ b/src/cmd/compile/internal/syntax/parser.go
@@ -827,7 +827,7 @@
 	switch p.tok {
 	case _Operator, _Star:
 		switch p.op {
-		case Mul, Add, Sub, Not, Xor:
+		case Mul, Add, Sub, Not, Xor, Tilde:
 			x := new(Operation)
 			x.pos = p.pos()
 			x.Op = p.op
@@ -991,7 +991,7 @@
 	case _Func:
 		pos := p.pos()
 		p.next()
-		_, ftyp := p.funcType("function literal")
+		_, ftyp := p.funcType("function type")
 		if p.tok == _Lbrace {
 			p.xnest++
 
@@ -1499,44 +1499,14 @@
 	p.want(_Interface)
 	p.want(_Lbrace)
 	p.list("interface type", _Semi, _Rbrace, func() bool {
-		switch p.tok {
-		case _Name:
-			f := p.methodDecl()
-			if f.Name == nil {
-				f = p.embeddedElem(f)
-			}
-			typ.MethodList = append(typ.MethodList, f)
-			return false
-
-		case _Lparen:
-			p.syntaxError("cannot parenthesize embedded type")
-			f := new(Field)
-			f.pos = p.pos()
-			p.next()
-			f.Type = p.qualifiedName(nil)
-			p.want(_Rparen)
-			typ.MethodList = append(typ.MethodList, f)
-			return false
-
-		case _Operator:
-			if p.op == Tilde {
-				typ.MethodList = append(typ.MethodList, p.embeddedElem(nil))
-				return false
-			}
-
-		default:
-			pos := p.pos()
-			if t := p.typeOrNil(); t != nil {
-				f := new(Field)
-				f.pos = pos
-				f.Type = t
-				typ.MethodList = append(typ.MethodList, p.embeddedElem(f))
-				return false
-			}
+		var f *Field
+		if p.tok == _Name {
+			f = p.methodDecl()
 		}
-
-		p.syntaxError("expecting method or embedded element")
-		p.advance(_Semi, _Rbrace)
+		if f == nil || f.Name == nil {
+			f = p.embeddedElem(f)
+		}
+		typ.MethodList = append(typ.MethodList, f)
 		return false
 	})
 
diff --git a/src/cmd/compile/internal/syntax/testdata/issue48382.go b/src/cmd/compile/internal/syntax/testdata/issue48382.go
index c00fee6..7c024a0 100644
--- a/src/cmd/compile/internal/syntax/testdata/issue48382.go
+++ b/src/cmd/compile/internal/syntax/testdata/issue48382.go
@@ -8,7 +8,8 @@
 type _ func /* ERROR function type must have no type parameters */ [ x /* ERROR missing type constraint */ ]()
 type _ func /* ERROR function type must have no type parameters */ [P any]()
 
-var _ = func /* ERROR function literal must have no type parameters */ [P any]() {}
+var _ = (func /* ERROR function type must have no type parameters */ [P any]())(nil)
+var _ = func /* ERROR function type must have no type parameters */ [P any]() {}
 
 type _ interface{
         m /* ERROR interface method must have no type parameters */ [P any]()
diff --git a/src/cmd/compile/internal/syntax/testdata/issue52391.go b/src/cmd/compile/internal/syntax/testdata/issue52391.go
new file mode 100644
index 0000000..f2098ce
--- /dev/null
+++ b/src/cmd/compile/internal/syntax/testdata/issue52391.go
@@ -0,0 +1,17 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package p
+
+type _ interface {
+	int
+	(int)
+	(*int)
+	*([]byte)
+	~(int)
+	(int) | (string)
+	(int) | ~(string)
+	(/* ERROR unexpected ~ */ ~int)
+	(int /* ERROR unexpected \| */ | /* ERROR unexpected string */ string /* ERROR unexpected \) */ )
+}
diff --git a/src/cmd/compile/internal/syntax/testdata/typeset.go b/src/cmd/compile/internal/syntax/testdata/typeset.go
index 19b74f2..fe5c3f4 100644
--- a/src/cmd/compile/internal/syntax/testdata/typeset.go
+++ b/src/cmd/compile/internal/syntax/testdata/typeset.go
@@ -65,15 +65,17 @@
 
 // Single-expression type parameter lists and those that don't start
 // with a (type parameter) name are considered array sizes.
-// The term must be a valid expression (it could be a type - and then
-// a type-checker will complain - but we don't allow ~ in the expr).
+// The term must be a valid expression (it could be a type incl. a
+// tilde term) but the type-checker will complain.
 type (
         _[t] t
-        _[/* ERROR unexpected ~ */ ~t] t
         _[t|t] t
-        _[/* ERROR unexpected ~ */ ~t|t] t
-        _[t| /* ERROR unexpected ~ */ ~t] t
-        _[/* ERROR unexpected ~ */ ~t|~t] t
+
+        // These are invalid and the type-checker will complain.
+        _[~t] t
+        _[~t|t] t
+        _[t|~t] t
+        _[~t|~t] t
 )
 
 type (
diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go
index 211068e..af66a32 100644
--- a/src/cmd/compile/internal/test/inl_test.go
+++ b/src/cmd/compile/internal/test/inl_test.go
@@ -128,15 +128,33 @@
 			"ValidRune",
 		},
 		"reflect": {
-			"Value.CanInt",
-			"Value.CanUint",
-			"Value.CanFloat",
-			"Value.CanComplex",
+			"Value.Bool",
+			"Value.Bytes",
 			"Value.CanAddr",
-			"Value.CanSet",
+			"Value.CanComplex",
+			"Value.CanFloat",
+			"Value.CanInt",
 			"Value.CanInterface",
+			"Value.CanSet",
+			"Value.CanUint",
+			"Value.Cap",
+			"Value.Complex",
+			"Value.Float",
+			"Value.Int",
+			"Value.Interface",
+			"Value.IsNil",
 			"Value.IsValid",
+			"Value.Kind",
+			"Value.Len",
 			"Value.MapRange",
+			"Value.OverflowComplex",
+			"Value.OverflowFloat",
+			"Value.OverflowInt",
+			"Value.OverflowUint",
+			"Value.String",
+			"Value.Type",
+			"Value.Uint",
+			"Value.UnsafeAddr",
 			"Value.pointer",
 			"add",
 			"align",
diff --git a/src/cmd/compile/internal/types2/decl.go b/src/cmd/compile/internal/types2/decl.go
index 95143cb..4f28c36 100644
--- a/src/cmd/compile/internal/types2/decl.go
+++ b/src/cmd/compile/internal/types2/decl.go
@@ -735,7 +735,7 @@
 			top := len(check.delayed)
 
 			// iota is the index of the current constDecl within the group
-			if first < 0 || list[index-1].(*syntax.ConstDecl).Group != s.Group {
+			if first < 0 || s.Group == nil || list[index-1].(*syntax.ConstDecl).Group != s.Group {
 				first = index
 				last = nil
 			}
diff --git a/src/cmd/compile/internal/types2/expr.go b/src/cmd/compile/internal/types2/expr.go
index e0c22f5..33d329f 100644
--- a/src/cmd/compile/internal/types2/expr.go
+++ b/src/cmd/compile/internal/types2/expr.go
@@ -89,21 +89,11 @@
 func (check *Checker) overflow(x *operand) {
 	assert(x.mode == constant_)
 
-	// If the corresponding expression is an operation, use the
-	// operator position rather than the start of the expression
-	// as error position.
-	pos := syntax.StartPos(x.expr)
-	what := "" // operator description, if any
-	if op, _ := x.expr.(*syntax.Operation); op != nil {
-		pos = op.Pos()
-		what = opName(op)
-	}
-
 	if x.val.Kind() == constant.Unknown {
 		// TODO(gri) We should report exactly what went wrong. At the
 		//           moment we don't have the (go/constant) API for that.
 		//           See also TODO in go/constant/value.go.
-		check.error(pos, "constant result is not representable")
+		check.error(opPos(x.expr), "constant result is not representable")
 		return
 	}
 
@@ -119,22 +109,37 @@
 	// Untyped integer values must not grow arbitrarily.
 	const prec = 512 // 512 is the constant precision
 	if x.val.Kind() == constant.Int && constant.BitLen(x.val) > prec {
-		check.errorf(pos, "constant %s overflow", what)
+		check.errorf(opPos(x.expr), "constant %s overflow", opName(x.expr))
 		x.val = constant.MakeUnknown()
 	}
 }
 
-// opName returns the name of an operation, or the empty string.
-// Only operations that might overflow are handled.
-func opName(e *syntax.Operation) string {
-	op := int(e.Op)
-	if e.Y == nil {
-		if op < len(op2str1) {
-			return op2str1[op]
-		}
-	} else {
-		if op < len(op2str2) {
-			return op2str2[op]
+// opPos returns the position of the operator if x is an operation;
+// otherwise it returns the start position of x.
+func opPos(x syntax.Expr) syntax.Pos {
+	switch op := x.(type) {
+	case nil:
+		return nopos // don't crash
+	case *syntax.Operation:
+		return op.Pos()
+	default:
+		return syntax.StartPos(x)
+	}
+}
+
+// opName returns the name of the operation if x is an operation
+// that might overflow; otherwise it returns the empty string.
+func opName(x syntax.Expr) string {
+	if e, _ := x.(*syntax.Operation); e != nil {
+		op := int(e.Op)
+		if e.Y == nil {
+			if op < len(op2str1) {
+				return op2str1[op]
+			}
+		} else {
+			if op < len(op2str2) {
+				return op2str2[op]
+			}
 		}
 	}
 	return ""
@@ -203,6 +208,12 @@
 		x.typ = ch.elem
 		check.hasCallOrRecv = true
 		return
+
+	case syntax.Tilde:
+		// Provide a better error position and message than what check.op below could do.
+		check.error(e, "cannot use ~ outside of interface or type constraint")
+		x.mode = invalid
+		return
 	}
 
 	if !check.op(unaryOpPredicates, x, e.Op) {
diff --git a/src/cmd/compile/internal/types2/resolver.go b/src/cmd/compile/internal/types2/resolver.go
index 5c64ecd..5d498b6 100644
--- a/src/cmd/compile/internal/types2/resolver.go
+++ b/src/cmd/compile/internal/types2/resolver.go
@@ -340,7 +340,7 @@
 
 			case *syntax.ConstDecl:
 				// iota is the index of the current constDecl within the group
-				if first < 0 || file.DeclList[index-1].(*syntax.ConstDecl).Group != s.Group {
+				if first < 0 || s.Group == nil || file.DeclList[index-1].(*syntax.ConstDecl).Group != s.Group {
 					first = index
 					last = nil
 				}
diff --git a/src/cmd/compile/internal/types2/sizes.go b/src/cmd/compile/internal/types2/sizes.go
index f530849..6133e15 100644
--- a/src/cmd/compile/internal/types2/sizes.go
+++ b/src/cmd/compile/internal/types2/sizes.go
@@ -166,10 +166,11 @@
 // common architecture word sizes and alignments
 var gcArchSizes = map[string]*StdSizes{
 	"386":      {4, 4},
-	"arm":      {4, 4},
-	"arm64":    {8, 8},
 	"amd64":    {8, 8},
 	"amd64p32": {4, 8},
+	"arm":      {4, 4},
+	"arm64":    {8, 8},
+	"loong64":  {8, 8},
 	"mips":     {4, 4},
 	"mipsle":   {4, 4},
 	"mips64":   {8, 8},
@@ -188,7 +189,7 @@
 // The result is nil if a compiler/architecture pair is not known.
 //
 // Supported architectures for compiler "gc":
-// "386", "arm", "arm64", "amd64", "amd64p32", "mips", "mipsle",
+// "386", "amd64", "amd64p32", "arm", "arm64", "loong64", "mips", "mipsle",
 // "mips64", "mips64le", "ppc64", "ppc64le", "riscv64", "s390x", "sparc64", "wasm".
 func SizesFor(compiler, arch string) Sizes {
 	var m map[string]*StdSizes
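The exported mirror of this table in go/types picks up the same entry in this change; a small sketch, where the printed value follows from the 8-byte word size above:

	package main

	import (
		"fmt"
		"go/types"
	)

	func main() {
		if s := types.SizesFor("gc", "loong64"); s != nil {
			fmt.Println(s.Sizeof(types.Typ[types.Uintptr])) // 8
		}
	}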
diff --git a/src/cmd/compile/internal/types2/testdata/check/const0.go b/src/cmd/compile/internal/types2/testdata/check/const0.go
index 3cffdf9..229c2486 100644
--- a/src/cmd/compile/internal/types2/testdata/check/const0.go
+++ b/src/cmd/compile/internal/types2/testdata/check/const0.go
@@ -349,6 +349,25 @@
 	assert(iota == 0)
 })
 
+// issue #52438
+const i1 = iota
+const i2 = iota
+const i3 = iota
+
+func _() {
+	assert(i1 == 0)
+	assert(i2 == 0)
+	assert(i3 == 0)
+
+	const i4 = iota
+	const i5 = iota
+	const i6 = iota
+
+	assert(i4 == 0)
+	assert(i5 == 0)
+	assert(i6 == 0)
+}
+
 // untyped constants must not get arbitrarily large
 const prec = 512 // internal maximum precision for integers
 const maxInt = (1<<(prec/2) - 1) * (1<<(prec/2) + 1) // == 1<<prec - 1
diff --git a/src/cmd/compile/internal/types2/testdata/check/expr0.go b/src/cmd/compile/internal/types2/testdata/check/expr0.go
index 1aac726..821b07f 100644
--- a/src/cmd/compile/internal/types2/testdata/check/expr0.go
+++ b/src/cmd/compile/internal/types2/testdata/check/expr0.go
@@ -178,3 +178,10 @@
 	_ = -g /* ERROR 2-valued g */ ()
 	_ = <-g /* ERROR 2-valued g */ ()
 }
+
+// ~ is accepted as a unary operator but is only permitted in interface type elements
+var (
+	_ = ~ /* ERROR cannot use ~ outside of interface or type constraint */ 0
+	_ = ~ /* ERROR cannot use ~ outside of interface or type constraint */ "foo"
+	_ = ~ /* ERROR cannot use ~ outside of interface or type constraint */ i0
+)
\ No newline at end of file
diff --git a/src/cmd/compile/internal/types2/testdata/fixedbugs/issue49482.go b/src/cmd/compile/internal/types2/testdata/fixedbugs/issue49482.go
index f289d2e..503d994 100644
--- a/src/cmd/compile/internal/types2/testdata/fixedbugs/issue49482.go
+++ b/src/cmd/compile/internal/types2/testdata/fixedbugs/issue49482.go
@@ -22,4 +22,4 @@
 type _[P *struct /* ERROR "not an expression" */ {}| int /* ERROR "not an expression" */ ] struct{}
 
 // The following fails to parse, due to the '~'
-type _[P *struct /* ERROR "not an expression" */ {}|~ /* ERROR "unexpected ~" */ int] struct{}
+type _[P *struct /* ERROR "not an expression" */ {}|~int /* ERROR "not an expression" */ ] struct{}
diff --git a/src/cmd/compile/internal/types2/testdata/fixedbugs/issue52401.go b/src/cmd/compile/internal/types2/testdata/fixedbugs/issue52401.go
new file mode 100644
index 0000000..c7efd8c
--- /dev/null
+++ b/src/cmd/compile/internal/types2/testdata/fixedbugs/issue52401.go
@@ -0,0 +1,11 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package p
+
+func _() {
+	const x = 0
+	x /* ERROR cannot assign to x */ += 1
+	x /* ERROR cannot assign to x */ ++
+}
diff --git a/src/cmd/compile/internal/walk/compare.go b/src/cmd/compile/internal/walk/compare.go
index 993f139..fef2d71 100644
--- a/src/cmd/compile/internal/walk/compare.go
+++ b/src/cmd/compile/internal/walk/compare.go
@@ -8,6 +8,7 @@
 	"go/constant"
 
 	"cmd/compile/internal/base"
+	"cmd/compile/internal/compare"
 	"cmd/compile/internal/ir"
 	"cmd/compile/internal/reflectdata"
 	"cmd/compile/internal/ssagen"
@@ -178,7 +179,7 @@
 		andor = ir.OOROR
 	}
 	var expr ir.Node
-	compare := func(el, er ir.Node) {
+	comp := func(el, er ir.Node) {
 		a := ir.NewBinaryExpr(base.Pos, n.Op(), el, er)
 		if expr == nil {
 			expr = a
@@ -186,18 +187,26 @@
 			expr = ir.NewLogicalExpr(base.Pos, andor, expr, a)
 		}
 	}
+	and := func(cond ir.Node) {
+		if expr == nil {
+			expr = cond
+		} else {
+			expr = ir.NewLogicalExpr(base.Pos, andor, expr, cond)
+		}
+	}
 	cmpl = safeExpr(cmpl, init)
 	cmpr = safeExpr(cmpr, init)
 	if t.IsStruct() {
-		for _, f := range t.Fields().Slice() {
-			sym := f.Sym
-			if sym.IsBlank() {
-				continue
+		conds := compare.EqStruct(t, cmpl, cmpr)
+		if n.Op() == ir.OEQ {
+			for _, cond := range conds {
+				and(cond)
 			}
-			compare(
-				ir.NewSelectorExpr(base.Pos, ir.OXDOT, cmpl, sym),
-				ir.NewSelectorExpr(base.Pos, ir.OXDOT, cmpr, sym),
-			)
+		} else {
+			for _, cond := range conds {
+				notCond := ir.NewUnaryExpr(base.Pos, ir.ONOT, cond)
+				and(notCond)
+			}
 		}
 	} else {
 		step := int64(1)
@@ -221,7 +230,7 @@
 				step = 1
 			}
 			if step == 1 {
-				compare(
+				comp(
 					ir.NewIndexExpr(base.Pos, cmpl, ir.NewInt(i)),
 					ir.NewIndexExpr(base.Pos, cmpr, ir.NewInt(i)),
 				)
@@ -249,7 +258,7 @@
 					rb = ir.NewBinaryExpr(base.Pos, ir.OLSH, rb, ir.NewInt(8*t.Elem().Size()*offset))
 					cmprw = ir.NewBinaryExpr(base.Pos, ir.OOR, cmprw, rb)
 				}
-				compare(cmplw, cmprw)
+				comp(cmplw, cmprw)
 				i += step
 				remains -= step * t.Elem().Size()
 			}
@@ -270,7 +279,7 @@
 func walkCompareInterface(n *ir.BinaryExpr, init *ir.Nodes) ir.Node {
 	n.Y = cheapExpr(n.Y, init)
 	n.X = cheapExpr(n.X, init)
-	eqtab, eqdata := reflectdata.EqInterface(n.X, n.Y)
+	eqtab, eqdata := compare.EqInterface(n.X, n.Y)
 	var cmp ir.Node
 	if n.Op() == ir.OEQ {
 		cmp = ir.NewLogicalExpr(base.Pos, ir.OANDAND, eqtab, eqdata)
@@ -384,7 +393,7 @@
 		// prepare for rewrite below
 		n.X = cheapExpr(n.X, init)
 		n.Y = cheapExpr(n.Y, init)
-		eqlen, eqmem := reflectdata.EqString(n.X, n.Y)
+		eqlen, eqmem := compare.EqString(n.X, n.Y)
 		// quick check of len before full compare for == or !=.
 		// memequal then tests equality up to length len.
 		if n.Op() == ir.OEQ {
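
The struct branch above now asks compare.EqStruct for one equality condition per comparable field and stitches them together: an AND of the conditions for `==`, and an OR of their negations for `!=`. That lowering is observable from ordinary Go, as this sketch shows:

```go
package main

import "fmt"

type point struct{ x, y int }

func main() {
	a := point{1, 2}
	b := point{1, 3}
	// == is lowered to an AND of per-field equality conditions...
	fmt.Println(a == b, a.x == b.x && a.y == b.y) // false false
	// ...and != to an OR of their negations.
	fmt.Println(a != b, !(a.x == b.x) || !(a.y == b.y)) // true true
}
```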
diff --git a/src/cmd/dist/test.go b/src/cmd/dist/test.go
index ee521f8..7c8f1ea 100644
--- a/src/cmd/dist/test.go
+++ b/src/cmd/dist/test.go
@@ -556,6 +556,55 @@
 		})
 	}
 
+	// morestack tests. We only run these in long-test mode
+	// (with GO_TEST_SHORT=false) because the runtime test is
+	// already quite long and mayMoreStackMove makes it about
+	// twice as slow.
+	if !t.compileOnly && short() == "false" {
+		// hooks is the set of maymorestack hooks to test with.
+		hooks := []string{"mayMoreStackPreempt", "mayMoreStackMove"}
+		// pkgs is the set of test packages to run.
+		pkgs := []string{"runtime", "reflect", "sync"}
+		// hookPkgs is the set of package patterns to apply
+		// the maymorestack hook to.
+		hookPkgs := []string{"runtime/...", "reflect", "sync"}
+		// unhookPkgs is the set of package patterns to
+		// exclude from hookPkgs.
+		unhookPkgs := []string{"runtime/testdata/..."}
+		for _, hook := range hooks {
+			// Construct the build flags to use the
+			// maymorestack hook in the compiler and
+			// assembler. We pass this via the GOFLAGS
+			// environment variable so that it applies to
+			// both the test itself and to binaries built
+			// by the test.
+			goFlagsList := []string{}
+			for _, flag := range []string{"-gcflags", "-asmflags"} {
+				for _, hookPkg := range hookPkgs {
+					goFlagsList = append(goFlagsList, flag+"="+hookPkg+"=-d=maymorestack=runtime."+hook)
+				}
+				for _, unhookPkg := range unhookPkgs {
+					goFlagsList = append(goFlagsList, flag+"="+unhookPkg+"=")
+				}
+			}
+			goFlags := strings.Join(goFlagsList, " ")
+
+			for _, pkg := range pkgs {
+				pkg := pkg
+				testName := hook + ":" + pkg
+				t.tests = append(t.tests, distTest{
+					name:    testName,
+					heading: "maymorestack=" + hook,
+					fn: func(dt *distTest) error {
+						cmd := t.addCmd(dt, "src", t.goTest(), t.timeout(600), pkg, "-short")
+						setEnv(cmd, "GOFLAGS", goFlags)
+						return nil
+					},
+				})
+			}
+		}
+	}
+
 	// This test needs its stdout/stderr to be terminals, so we don't run it from cmd/go's tests.
 	// See issue 18153.
 	if goos == "linux" {
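
To see what the flag construction in the morestack hunk produces, here is the same loop extracted into a standalone program; the hook and package lists mirror the test above, nothing else is assumed:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	hook := "mayMoreStackPreempt"
	hookPkgs := []string{"runtime/...", "reflect", "sync"}
	unhookPkgs := []string{"runtime/testdata/..."}
	var goFlagsList []string
	for _, flag := range []string{"-gcflags", "-asmflags"} {
		for _, pkg := range hookPkgs {
			goFlagsList = append(goFlagsList, flag+"="+pkg+"=-d=maymorestack=runtime."+hook)
		}
		for _, pkg := range unhookPkgs {
			// An empty value clears the hook for excluded packages.
			goFlagsList = append(goFlagsList, flag+"="+pkg+"=")
		}
	}
	fmt.Println(strings.Join(goFlagsList, " "))
}
```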
diff --git a/src/cmd/go/internal/base/env.go b/src/cmd/go/internal/base/env.go
index 5f2665d..2f47300 100644
--- a/src/cmd/go/internal/base/env.go
+++ b/src/cmd/go/internal/base/env.go
@@ -4,12 +4,20 @@
 
 package base
 
+import (
+	"fmt"
+	"path/filepath"
+)
+
 // AppendPWD returns the result of appending PWD=dir to the environment base.
 //
 // The resulting environment makes os.Getwd more efficient for a subprocess
 // running in dir.
 func AppendPWD(base []string, dir string) []string {
-	// Internally we only use absolute paths, so dir is absolute.
-	// Even if dir is not absolute, no harm done.
+	// POSIX requires PWD to be absolute.
+	// Internally we only use absolute paths, so dir should already be absolute.
+	if !filepath.IsAbs(dir) {
+		panic(fmt.Sprintf("AppendPWD with relative path %q", dir))
+	}
 	return append(base, "PWD="+dir)
 }
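
The stricter AppendPWD contract is easy to exercise. This sketch reimplements it outside cmd/go; the helper name is copied from the diff, but this is an illustration, not the real package:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// appendPWD mirrors the hardened base.AppendPWD above: POSIX requires
// PWD to be absolute, so a relative dir is now a programming error.
func appendPWD(env []string, dir string) []string {
	if !filepath.IsAbs(dir) {
		panic(fmt.Sprintf("AppendPWD with relative path %q", dir))
	}
	return append(env, "PWD="+dir)
}

func main() {
	env := appendPWD(os.Environ(), "/tmp") // Unix-style absolute path, for illustration
	fmt.Println(env[len(env)-1])           // PWD=/tmp
	// appendPWD(os.Environ(), "relative/dir") // would panic
}
```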
diff --git a/src/cmd/go/internal/generate/generate_test.go b/src/cmd/go/internal/generate/generate_test.go
index 15b1279..d61ecf1 100644
--- a/src/cmd/go/internal/generate/generate_test.go
+++ b/src/cmd/go/internal/generate/generate_test.go
@@ -5,7 +5,9 @@
 package generate
 
 import (
+	"internal/testenv"
 	"os"
+	"path/filepath"
 	"reflect"
 	"runtime"
 	"testing"
@@ -41,10 +43,11 @@
 }
 
 func TestGenerateCommandParse(t *testing.T) {
+	dir := filepath.Join(testenv.GOROOT(t), "src", "sys")
 	g := &Generator{
 		r:        nil, // Unused here.
-		path:     "/usr/ken/sys/proc.go",
-		dir:      "/usr/ken/sys",
+		path:     filepath.Join(dir, "proc.go"),
+		dir:      dir,
 		file:     "proc.go",
 		pkg:      "sys",
 		commands: make(map[string][]string),
@@ -84,10 +87,11 @@
 //     before executing the test.  i.e., execute the split as if it
 //     were processing that source line.
 func TestGenerateCommandShorthand(t *testing.T) {
+	dir := filepath.Join(testenv.GOROOT(t), "src", "sys")
 	g := &Generator{
 		r:        nil, // Unused here.
-		path:     "/usr/ken/sys/proc.go",
-		dir:      "/usr/ken/sys",
+		path:     filepath.Join(dir, "proc.go"),
+		dir:      dir,
 		file:     "proc.go",
 		pkg:      "sys",
 		commands: make(map[string][]string),
@@ -222,10 +226,11 @@
 //     before executing the test.  i.e., execute the split as if it
 //     were processing that source line.
 func TestGenerateCommandShortHand2(t *testing.T) {
+	dir := filepath.Join(testenv.GOROOT(t), "src", "sys")
 	g := &Generator{
 		r:        nil, // Unused here.
-		path:     "/usr/ken/sys/proc.go",
-		dir:      "/usr/ken/sys",
+		path:     filepath.Join(dir, "proc.go"),
+		dir:      dir,
 		file:     "proc.go",
 		pkg:      "sys",
 		commands: make(map[string][]string),
diff --git a/src/cmd/go/internal/list/list.go b/src/cmd/go/internal/list/list.go
index 17864e1..e9e0910 100644
--- a/src/cmd/go/internal/list/list.go
+++ b/src/cmd/go/internal/list/list.go
@@ -568,6 +568,13 @@
 		IgnoreImports:   *listFind,
 		ModResolveTests: *listTest,
 		LoadVCS:         true,
+		// SuppressDeps is set if the user explicitly asked for the JSON fields they
+		// need and did not ask for Deps or DepsErrors. It's not set when using a
+		// template string, even if *listFmt doesn't contain .Deps, because Deps are
+		// used to find import cycles for test variants of packages, and users who
+		// have been providing format strings might not expect those errors to stop
+		// showing up.
+		// See issue #52443.
+		SuppressDeps: !listJsonFields.needAny("Deps", "DepsErrors"),
 	}
 	pkgs := load.PackagesAndErrors(ctx, pkgOpts, args)
 	if !*listE {
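
The `needAny` call above is the gatekeeper: Deps are only collected when the user's `-json=<fields>` set mentions them. A hedged sketch of how such a field-set check might look; the real cmd/go type differs, but the empty-set-means-everything rule is the same:

```go
package main

import "fmt"

// jsonFields is a hypothetical stand-in for cmd/go's -json=<fields> set.
type jsonFields map[string]bool

func (f jsonFields) needAny(fields ...string) bool {
	if len(f) == 0 {
		return true // plain -json: all fields, including Deps
	}
	for _, field := range fields {
		if f[field] {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(jsonFields{}.needAny("Deps", "DepsErrors"))                   // true
	fmt.Println(jsonFields{"ImportPath": true}.needAny("Deps", "DepsErrors")) // false
	fmt.Println(jsonFields{"Deps": true}.needAny("Deps", "DepsErrors"))       // true
}
```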
diff --git a/src/cmd/go/internal/load/pkg.go b/src/cmd/go/internal/load/pkg.go
index 10799ad..7f2ce32 100644
--- a/src/cmd/go/internal/load/pkg.go
+++ b/src/cmd/go/internal/load/pkg.go
@@ -1944,7 +1944,9 @@
 		}
 	}
 	p.Internal.Imports = imports
-	p.collectDeps()
+	if !opts.SuppressDeps {
+		p.collectDeps()
+	}
 	if p.Error == nil && p.Name == "main" && !p.Internal.ForceLibrary && len(p.DepsErrors) == 0 {
 		// TODO(bcmills): loading VCS metadata can be fairly slow.
 		// Consider starting this as a background goroutine and retrieving the result
@@ -2685,6 +2687,12 @@
 
 	// LoadVCS controls whether we also load version-control metadata for main packages.
 	LoadVCS bool
+
+	// SuppressDeps is true if the caller does not need Deps and DepsErrors to be populated
+	// on the package. TestPackagesAndErrors examines the Deps field to determine if the test
+	// variant has an import cycle, so SuppressDeps should not be set if TestPackagesAndErrors
+	// will be called on the package.
+	SuppressDeps bool
 }
 
 // PackagesAndErrors returns the packages named by the command line arguments
diff --git a/src/cmd/go/internal/modload/load.go b/src/cmd/go/internal/modload/load.go
index e85a33d..7f1a88f 100644
--- a/src/cmd/go/internal/modload/load.go
+++ b/src/cmd/go/internal/modload/load.go
@@ -605,11 +605,13 @@
 
 	pkg := pathInModuleCache(ctx, absDir, rs)
 	if pkg == "" {
-		scope := "main module or its selected dependencies"
 		if inWorkspaceMode() {
-			scope = "modules listed in go.work or their selected dependencies"
+			if mr := findModuleRoot(absDir); mr != "" {
+				return "", fmt.Errorf("directory %s is contained in a module that is not one of the workspace modules listed in go.work. You can add the module to the workspace using go work use %s", base.ShortPath(absDir), base.ShortPath(mr))
+			}
+			return "", fmt.Errorf("directory %s outside modules listed in go.work or their selected dependencies", base.ShortPath(absDir))
 		}
-		return "", fmt.Errorf("directory %s outside %s", base.ShortPath(absDir), scope)
+		return "", fmt.Errorf("directory %s outside main module or its selected dependencies", base.ShortPath(absDir))
 	}
 	return pkg, nil
 }
diff --git a/src/cmd/go/internal/vcs/vcs.go b/src/cmd/go/internal/vcs/vcs.go
index 2acabf7..7dbcfb7 100644
--- a/src/cmd/go/internal/vcs/vcs.go
+++ b/src/cmd/go/internal/vcs/vcs.go
@@ -22,7 +22,6 @@
 	"sync"
 	"time"
 
-	"cmd/go/internal/base"
 	"cmd/go/internal/cfg"
 	"cmd/go/internal/search"
 	"cmd/go/internal/str"
@@ -657,7 +656,6 @@
 
 	cmd := exec.Command(v.Cmd, args...)
 	cmd.Dir = dir
-	cmd.Env = base.AppendPWD(os.Environ(), cmd.Dir)
 	if cfg.BuildX {
 		fmt.Fprintf(os.Stderr, "cd %s\n", dir)
 		fmt.Fprintf(os.Stderr, "%s %s\n", v.Cmd, strings.Join(args, " "))
@@ -669,7 +667,7 @@
 			if ee, ok := err.(*exec.ExitError); ok && len(ee.Stderr) > 0 {
 				os.Stderr.Write(ee.Stderr)
 			} else {
-				fmt.Fprintf(os.Stderr, err.Error())
+				fmt.Fprintln(os.Stderr, err.Error())
 			}
 		}
 	}
@@ -678,14 +676,24 @@
 
 // Ping pings to determine scheme to use.
 func (v *Cmd) Ping(scheme, repo string) error {
-	return v.runVerboseOnly(".", v.PingCmd, "scheme", scheme, "repo", repo)
+	// Run the ping command in an arbitrary working directory,
+	// but don't let the current working directory pollute the results.
+	// In module mode, we expect GOMODCACHE to exist and be a safe place for
+	// commands; in GOPATH mode, we expect that to be true of GOPATH/src.
+	dir := cfg.GOMODCACHE
+	if !cfg.ModulesEnabled {
+		dir = filepath.Join(cfg.BuildContext.GOPATH, "src")
+	}
+	os.MkdirAll(dir, 0777) // Ignore errors — if unsuccessful, the command will likely fail.
+
+	return v.runVerboseOnly(dir, v.PingCmd, "scheme", scheme, "repo", repo)
 }
 
 // Create creates a new copy of repo in dir.
 // The parent of dir must exist; dir must not.
 func (v *Cmd) Create(dir, repo string) error {
 	for _, cmd := range v.CreateCmd {
-		if err := v.run(".", cmd, "dir", dir, "repo", repo); err != nil {
+		if err := v.run(filepath.Dir(dir), cmd, "dir", dir, "repo", repo); err != nil {
 			return err
 		}
 	}
diff --git a/src/cmd/go/internal/work/buildid.go b/src/cmd/go/internal/work/buildid.go
index 76335e9..ac98aa3 100644
--- a/src/cmd/go/internal/work/buildid.go
+++ b/src/cmd/go/internal/work/buildid.go
@@ -160,7 +160,6 @@
 
 	cmdline := str.StringList(cfg.BuildToolexec, path, "-V=full")
 	cmd := exec.Command(cmdline[0], cmdline[1:]...)
-	cmd.Env = base.AppendPWD(os.Environ(), cmd.Dir)
 	var stdout, stderr bytes.Buffer
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr
@@ -219,9 +218,8 @@
 	// compile an empty file on standard input.
 	cmdline := str.StringList(cfg.BuildToolexec, name, "-###", "-x", language, "-c", "-")
 	cmd := exec.Command(cmdline[0], cmdline[1:]...)
-	cmd.Env = base.AppendPWD(os.Environ(), cmd.Dir)
 	// Force untranslated output so that we see the string "version".
-	cmd.Env = append(cmd.Env, "LC_ALL=C")
+	cmd.Env = append(os.Environ(), "LC_ALL=C")
 	out, err := cmd.CombinedOutput()
 	if err != nil {
 		return "", fmt.Errorf("%s: %v; output: %q", name, err, out)
diff --git a/src/cmd/go/internal/work/exec.go b/src/cmd/go/internal/work/exec.go
index 9c9d58b..0b8e5d2 100644
--- a/src/cmd/go/internal/work/exec.go
+++ b/src/cmd/go/internal/work/exec.go
@@ -2116,8 +2116,10 @@
 	cmd.Stderr = &buf
 	cleanup := passLongArgsInResponseFiles(cmd)
 	defer cleanup()
-	cmd.Dir = dir
-	cmd.Env = base.AppendPWD(os.Environ(), cmd.Dir)
+	if dir != "." {
+		cmd.Dir = dir
+	}
+	cmd.Env = cmd.Environ() // Pre-allocate with correct PWD.
 
 	// Add the TOOLEXEC_IMPORTPATH environment variable for -toolexec tools.
 	// It doesn't really matter if -toolexec isn't being used.
@@ -2606,8 +2608,7 @@
 	}
 	cmd := exec.Command(cmdArgs[0], cmdArgs[1:]...)
 	cmd.Dir = b.WorkDir
-	cmd.Env = base.AppendPWD(os.Environ(), cmd.Dir)
-	cmd.Env = append(cmd.Env, "LC_ALL=C")
+	cmd.Env = append(cmd.Environ(), "LC_ALL=C")
 	out, _ := cmd.CombinedOutput()
 	// GCC says "unrecognized command line option".
 	// clang says "unknown argument".
@@ -3071,7 +3072,7 @@
 )
 
 func (b *Builder) swigDoVersionCheck() error {
-	out, err := b.runOut(nil, "", nil, "swig", "-version")
+	out, err := b.runOut(nil, ".", nil, "swig", "-version")
 	if err != nil {
 		return err
 	}
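
Several of these call sites now rely on the new `(*exec.Cmd).Environ` method, which returns the environment the child process will receive, including the implicit `PWD` entry derived from `Cmd.Dir`. For example:

```go
package main

import (
	"fmt"
	"os/exec"
	"strings"
)

func main() {
	cmd := exec.Command("go", "version")
	cmd.Dir = "/tmp" // Unix-style absolute path, for illustration
	// Environ reflects what the subprocess will see, PWD included.
	for _, kv := range cmd.Environ() {
		if strings.HasPrefix(kv, "PWD=") {
			fmt.Println(kv) // PWD=/tmp
		}
	}
}
```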
diff --git a/src/cmd/go/testdata/script/list_json_fields.txt b/src/cmd/go/testdata/script/list_json_fields.txt
index 58c9efa..9b8edc6 100644
--- a/src/cmd/go/testdata/script/list_json_fields.txt
+++ b/src/cmd/go/testdata/script/list_json_fields.txt
@@ -21,6 +21,11 @@
 go list -json=ImportPath,Name,GoFiles,Imports
 cmp stdout want-json-multiple.txt
 
+# Test that -json=Deps outputs the Deps field.
+go list -json=Deps
+stdout '"Deps": \['
+stdout '"errors",'
+
 -- go.mod --
 module example.com/a
 
diff --git a/src/cmd/go/testdata/script/mod_list_direct.txt b/src/cmd/go/testdata/script/mod_list_direct.txt
index 9b7a04c..3aa1881 100644
--- a/src/cmd/go/testdata/script/mod_list_direct.txt
+++ b/src/cmd/go/testdata/script/mod_list_direct.txt
@@ -10,7 +10,7 @@
 # For a while, (*modfetch.codeRepo).Stat was not checking for a go.mod file,
 # which would produce a hard error at the subsequent call to GoMod.
 
-go get
+go get -v
 
 -- go.mod --
 module example.com
diff --git a/src/cmd/go/testdata/script/work_module_not_in_go_work.txt b/src/cmd/go/testdata/script/work_module_not_in_go_work.txt
index 23d908c..9109b2d 100644
--- a/src/cmd/go/testdata/script/work_module_not_in_go_work.txt
+++ b/src/cmd/go/testdata/script/work_module_not_in_go_work.txt
@@ -6,8 +6,8 @@
 ! go list ./...
 stderr 'pattern ./...: directory prefix . does not contain modules listed in go.work or their selected dependencies'
 
-! go list ./a
-stderr 'directory a outside modules listed in go.work'
+! go list ./a/c
+stderr 'directory a[\\/]c is contained in a module that is not one of the workspace modules listed in go.work. You can add the module to the workspace using go work use a'
 
 -- go.work --
 go 1.18
@@ -19,6 +19,8 @@
 go 1.18
 -- a/a.go --
 package a
+-- a/c/c.go --
+package c
 -- b/go.mod --
 module example.com/b
 
diff --git a/src/cmd/gofmt/gofmt.go b/src/cmd/gofmt/gofmt.go
index 5fa883f..9b639bd 100644
--- a/src/cmd/gofmt/gofmt.go
+++ b/src/cmd/gofmt/gofmt.go
@@ -76,6 +76,11 @@
 	if *allErrors {
 		parserMode |= parser.AllErrors
 	}
+	// Both -r and -s make use of go/ast's object resolution.
+	// If neither is being used, avoid that unnecessary work.
+	if *rewriteRule == "" && !*simplifyAST {
+		parserMode |= parser.SkipObjectResolution
+	}
 }
 
 func isGoFile(f fs.DirEntry) bool {
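
`parser.SkipObjectResolution` disables construction of the deprecated `ast.Object` graph, which only the `-r` and `-s` paths consume. A minimal demonstration of parsing with the flag set:

```go
package main

import (
	"fmt"
	"go/parser"
	"go/token"
)

func main() {
	const src = "package p\n\nfunc f() {}\n"
	fset := token.NewFileSet()
	// SkipObjectResolution tells go/parser not to build the deprecated
	// ast.Object graph; gofmt only needs that graph for -r and -s.
	mode := parser.ParseComments | parser.SkipObjectResolution
	f, err := parser.ParseFile(fset, "p.go", src, mode)
	if err != nil {
		panic(err)
	}
	fmt.Println(f.Name.Name, f.Scope == nil) // p true
}
```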
diff --git a/src/cmd/internal/obj/objfile.go b/src/cmd/internal/obj/objfile.go
index c980a7c..2f7ce06 100644
--- a/src/cmd/internal/obj/objfile.go
+++ b/src/cmd/internal/obj/objfile.go
@@ -726,11 +726,13 @@
 		}
 
 		o.Write(&b)
+		p := b.Bytes()
 		isym := &LSym{
 			Type:   objabi.SDATA, // for now, I don't think it matters
 			PkgIdx: goobj.PkgIdxSelf,
 			SymIdx: symidx,
-			P:      append([]byte(nil), b.Bytes()...),
+			P:      append([]byte(nil), p...),
+			Size:   int64(len(p)),
 		}
 		isym.Set(AttrIndexed, true)
 		symidx++
diff --git a/src/cmd/link/internal/ld/stackcheck.go b/src/cmd/link/internal/ld/stackcheck.go
index 520e4d6..f0e1367 100644
--- a/src/cmd/link/internal/ld/stackcheck.go
+++ b/src/cmd/link/internal/ld/stackcheck.go
@@ -101,7 +101,7 @@
 		// the same function multiple times at different
 		// depths, but lets us find all paths.
 		for _, root := range roots {
-			ctxt.Errorf(root, "nosplit stack overflow")
+			ctxt.Errorf(root, "nosplit stack over %d byte limit", limit)
 			chain := []stackCheckChain{{stackCheckEdge{0, root}, false}}
 			sc.report(root, limit, &chain)
 		}
diff --git a/src/cmd/link/internal/ld/stackcheck_test.go b/src/cmd/link/internal/ld/stackcheck_test.go
index 21dbf2b..2089bad 100644
--- a/src/cmd/link/internal/ld/stackcheck_test.go
+++ b/src/cmd/link/internal/ld/stackcheck_test.go
@@ -5,13 +5,12 @@
 package ld
 
 import (
-	"cmd/internal/objabi"
-	"cmd/internal/sys"
 	"fmt"
 	"internal/testenv"
 	"os"
 	"os/exec"
 	"regexp"
+	"strconv"
 	"testing"
 )
 
@@ -24,7 +23,7 @@
 	cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", os.DevNull, "./testdata/stackcheck")
 	// The rules for computing frame sizes on all of the
 	// architectures are complicated, so just do this on amd64.
-	cmd.Env = append(os.Environ(), "GOARCH=amd64")
+	cmd.Env = append(os.Environ(), "GOARCH=amd64", "GOOS=linux")
 	outB, err := cmd.CombinedOutput()
 
 	if err == nil {
@@ -34,13 +33,13 @@
 
 	t.Logf("linker output:\n%s", out)
 
-	// Construct expected stanzas
-	arch := sys.ArchAMD64
-	call := 0
-	if !arch.HasLR {
-		call = arch.RegSize
+	// Get expected limit.
+	limitRe := regexp.MustCompile("nosplit stack over ([0-9]+) byte limit")
+	m := limitRe.FindStringSubmatch(out)
+	if m == nil {
+		t.Fatalf("no overflow errors in output")
 	}
-	limit := objabi.StackLimit - call
+	limit, _ := strconv.Atoi(m[1])
 
 	wantMap := map[string]string{
 		"main.startSelf": fmt.Sprintf(
@@ -67,7 +66,7 @@
 	}
 
 	// Parse stanzas
-	stanza := regexp.MustCompile(`^(.*): nosplit stack overflow\n(.*\n(?: .*\n)*)`)
+	stanza := regexp.MustCompile(`^(.*): nosplit stack over [0-9]+ byte limit\n(.*\n(?: .*\n)*)`)
 	// Strip comments from cmd/go
 	out = regexp.MustCompile(`(?m)^#.*\n`).ReplaceAllString(out, "")
 	for len(out) > 0 {
diff --git a/misc/trace/README.md b/src/cmd/trace/static/README.md
similarity index 96%
rename from misc/trace/README.md
rename to src/cmd/trace/static/README.md
index 218d728..f81c59e 100644
--- a/misc/trace/README.md
+++ b/src/cmd/trace/static/README.md
@@ -17,7 +17,7 @@
 $ git clone https://chromium.googlesource.com/catapult
 $ cd catapult
 $ ./tracing/bin/vulcanize_trace_viewer --config=full
-$ cp tracing/bin/trace_viewer_full.html $GOROOT/misc/trace/trace_viewer_full.html
+$ cp tracing/bin/trace_viewer_full.html $GOROOT/src/cmd/trace/static/trace_viewer_full.html
 ```
 
 We are supposed to use --config=lean (produces smaller html),
@@ -31,7 +31,7 @@
 This is copied from the catapult repo.
 
 ```
-$ cp third_party/polymer/components/webcomponentsjs/webcomponents.min.js $GOROOT/misc/trace/webcomponents.min.js
+$ cp third_party/polymer/components/webcomponentsjs/webcomponents.min.js $GOROOT/src/cmd/trace/static/webcomponents.min.js
 ```
 
 ## Licenses
diff --git a/misc/trace/trace_viewer_full.html b/src/cmd/trace/static/trace_viewer_full.html
similarity index 100%
rename from misc/trace/trace_viewer_full.html
rename to src/cmd/trace/static/trace_viewer_full.html
diff --git a/misc/trace/webcomponents.min.js b/src/cmd/trace/static/webcomponents.min.js
similarity index 100%
rename from misc/trace/webcomponents.min.js
rename to src/cmd/trace/static/webcomponents.min.js
diff --git a/src/cmd/trace/trace.go b/src/cmd/trace/trace.go
index 0139639..a0d742a 100644
--- a/src/cmd/trace/trace.go
+++ b/src/cmd/trace/trace.go
@@ -6,6 +6,7 @@
 
 import (
 	"cmd/internal/traceviewer"
+	"embed"
 	"encoding/json"
 	"fmt"
 	"internal/trace"
@@ -13,8 +14,6 @@
 	"log"
 	"math"
 	"net/http"
-	"path/filepath"
-	"runtime"
 	"runtime/debug"
 	"sort"
 	"strconv"
@@ -22,13 +21,16 @@
 	"time"
 )
 
+//go:embed static/trace_viewer_full.html static/webcomponents.min.js
+var staticContent embed.FS
+
 func init() {
 	http.HandleFunc("/trace", httpTrace)
 	http.HandleFunc("/jsontrace", httpJsonTrace)
-	http.HandleFunc("/trace_viewer_html", httpTraceViewerHTML)
-	http.HandleFunc("/webcomponents.min.js", webcomponentsJS)
+	http.Handle("/static/", http.FileServer(http.FS(staticContent)))
 }
 
 // httpTrace serves either whole trace (goid==0) or trace for goid goroutine.
 func httpTrace(w http.ResponseWriter, r *http.Request) {
 	_, err := parseTrace()
@@ -50,19 +52,19 @@
 var templTrace = `
 <html>
 <head>
-<script src="/webcomponents.min.js"></script>
+<script src="/static/webcomponents.min.js"></script>
 <script>
 'use strict';
 
 function onTraceViewerImportFail() {
   document.addEventListener('DOMContentLoaded', function() {
     document.body.textContent =
-    '/trace_viewer_full.html is missing. File a bug in https://golang.org/issue';
+    '/static/trace_viewer_full.html is missing. File a bug in https://golang.org/issue';
   });
 }
 </script>
 
-<link rel="import" href="/trace_viewer_html"
+<link rel="import" href="/static/trace_viewer_full.html"
       onerror="onTraceViewerImportFail(event)">
 
 <style type="text/css">
@@ -173,16 +175,6 @@
 </html>
 `
 
-// httpTraceViewerHTML serves static part of trace-viewer.
-// This URL is queried from templTrace HTML.
-func httpTraceViewerHTML(w http.ResponseWriter, r *http.Request) {
-	http.ServeFile(w, r, filepath.Join(runtime.GOROOT(), "misc", "trace", "trace_viewer_full.html"))
-}
-
-func webcomponentsJS(w http.ResponseWriter, r *http.Request) {
-	http.ServeFile(w, r, filepath.Join(runtime.GOROOT(), "misc", "trace", "webcomponents.min.js"))
-}
-
 // httpJsonTrace serves json trace, requested from within templTrace HTML.
 func httpJsonTrace(w http.ResponseWriter, r *http.Request) {
 	defer debug.FreeOSMemory()
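
The serving side of the rename: `embed.FS` preserves the `static/` path prefix, so a single `http.FileServer(http.FS(...))` handler mounted at `/static/` replaces both hand-written handlers. A self-contained sketch, assuming a `static/` directory next to the source file:

```go
package main

import (
	"embed"
	"log"
	"net/http"
)

//go:embed static
var staticContent embed.FS

func main() {
	// A request for /static/foo.html is looked up as static/foo.html in
	// the embedded FS, so no StripPrefix is needed here.
	http.Handle("/static/", http.FileServer(http.FS(staticContent)))
	log.Fatal(http.ListenAndServe("localhost:8080", nil))
}
```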
diff --git a/src/crypto/ecdsa/ecdsa.go b/src/crypto/ecdsa/ecdsa.go
index c1dd32a..d3ae456 100644
--- a/src/crypto/ecdsa/ecdsa.go
+++ b/src/crypto/ecdsa/ecdsa.go
@@ -146,7 +146,7 @@
 	params := c.Params()
 	// Note that for P-521 this will actually be 63 bits more than the order, as
 	// division rounds down, but the extra bit is inconsequential.
-	b := make([]byte, params.BitSize/8+8) // TODO: use params.N.BitLen()
+	b := make([]byte, params.N.BitLen()/8+8)
 	_, err = io.ReadFull(rand, b)
 	if err != nil {
 		return
@@ -264,13 +264,13 @@
 
 	// Create a CSPRNG that xors a stream of zeros with
 	// the output of the AES-CTR instance.
-	csprng := cipher.StreamReader{
+	csprng := &cipher.StreamReader{
 		R: zeroReader,
 		S: cipher.NewCTR(block, []byte(aesIV)),
 	}
 
 	c := priv.PublicKey.Curve
-	return sign(priv, &csprng, c, hash)
+	return sign(priv, csprng, c, hash)
 }
 
 func signGeneric(priv *PrivateKey, csprng *cipher.StreamReader, c elliptic.Curve, hash []byte) (r, s *big.Int, err error) {
@@ -398,16 +398,14 @@
 	return Verify(pub, hash, r, s)
 }
 
-type zr struct {
-	io.Reader
-}
+type zr struct{}
 
-// Read replaces the contents of dst with zeros.
-func (z *zr) Read(dst []byte) (n int, err error) {
+// Read replaces the contents of dst with zeros. It is safe for concurrent use.
+func (zr) Read(dst []byte) (n int, err error) {
 	for i := range dst {
 		dst[i] = 0
 	}
 	return len(dst), nil
 }
 
-var zeroReader = &zr{}
+var zeroReader = zr{}
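
The zero-reader feeds a `cipher.StreamReader` whose XOR with an all-zero stream simply exposes the AES-CTR keystream; that stream is the CSPRNG the signer draws nonces from. A sketch of the construction, where the fixed key and IV are placeholders rather than what ecdsa actually derives:

```go
package main

import (
	"crypto/aes"
	"crypto/cipher"
	"fmt"
	"io"
)

type zr struct{}

// Read replaces the contents of dst with zeros, as in the diff above.
func (zr) Read(dst []byte) (int, error) {
	for i := range dst {
		dst[i] = 0
	}
	return len(dst), nil
}

func main() {
	key := make([]byte, 32) // placeholder; ecdsa derives the key from the private key and digest
	block, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}
	// XORing an all-zero stream with AES-CTR output yields the raw
	// keystream, so the StreamReader acts as a deterministic CSPRNG.
	csprng := &cipher.StreamReader{R: zr{}, S: cipher.NewCTR(block, make([]byte, aes.BlockSize))}
	buf := make([]byte, 8)
	if _, err := io.ReadFull(csprng, buf); err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", buf)
}
```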
diff --git a/src/crypto/ed25519/ed25519vectors_test.go b/src/crypto/ed25519/ed25519vectors_test.go
index 74fcdcd..f933f28 100644
--- a/src/crypto/ed25519/ed25519vectors_test.go
+++ b/src/crypto/ed25519/ed25519vectors_test.go
@@ -74,11 +74,22 @@
 func downloadEd25519Vectors(t *testing.T) []byte {
 	testenv.MustHaveExternalNetwork(t)
 
+	// Create a temp dir with a modcache subdir for an isolated module cache.
+	d := t.TempDir()
+	modcache := filepath.Join(d, "modcache")
+	if err := os.Mkdir(modcache, 0777); err != nil {
+		t.Fatal(err)
+	}
+
+	t.Setenv("GO111MODULE", "on")
+	t.Setenv("GOMODCACHE", modcache)
+
 	// Download the JSON test file from the GOPROXY with `go mod download`,
 	// pinning the version so test and module caching works as expected.
 	goTool := testenv.GoToolPath(t)
 	path := "filippo.io/mostly-harmless/ed25519vectors@v0.0.0-20210322192420-30a2d7243a94"
-	cmd := exec.Command(goTool, "mod", "download", "-json", path)
+	cmd := exec.Command(goTool, "mod", "download", "-modcacherw", "-json", path)
 	// TODO: enable the sumdb once the TryBots proxy supports it.
 	cmd.Env = append(os.Environ(), "GONOSUMDB=*")
 	output, err := cmd.Output()
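
Outside the test, the same isolation idiom looks like this; the cache location here is a hypothetical choice, and the module path is the one pinned above:

```go
package main

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
)

func main() {
	// Throwaway module cache: the download cannot pollute (or depend on)
	// the developer's real GOMODCACHE, and -modcacherw keeps the tree
	// writable so plain os.RemoveAll can clean it up.
	modcache := filepath.Join(os.TempDir(), "ed25519vectors-modcache") // hypothetical location
	if err := os.MkdirAll(modcache, 0777); err != nil {
		panic(err)
	}
	path := "filippo.io/mostly-harmless/ed25519vectors@v0.0.0-20210322192420-30a2d7243a94"
	cmd := exec.Command("go", "mod", "download", "-modcacherw", "-json", path)
	cmd.Env = append(os.Environ(), "GOMODCACHE="+modcache)
	out, err := cmd.CombinedOutput()
	fmt.Println(string(out), err)
}
```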
diff --git a/src/crypto/elliptic/elliptic.go b/src/crypto/elliptic/elliptic.go
index 7ead09f..522d7af 100644
--- a/src/crypto/elliptic/elliptic.go
+++ b/src/crypto/elliptic/elliptic.go
@@ -36,295 +36,6 @@
 	ScalarBaseMult(k []byte) (x, y *big.Int)
 }
 
-func matchesSpecificCurve(params *CurveParams, available ...Curve) (Curve, bool) {
-	for _, c := range available {
-		if params == c.Params() {
-			return c, true
-		}
-	}
-	return nil, false
-}
-
-// CurveParams contains the parameters of an elliptic curve and also provides
-// a generic, non-constant time implementation of Curve.
-type CurveParams struct {
-	P       *big.Int // the order of the underlying field
-	N       *big.Int // the order of the base point
-	B       *big.Int // the constant of the curve equation
-	Gx, Gy  *big.Int // (x,y) of the base point
-	BitSize int      // the size of the underlying field
-	Name    string   // the canonical name of the curve
-}
-
-func (curve *CurveParams) Params() *CurveParams {
-	return curve
-}
-
-// CurveParams operates, internally, on Jacobian coordinates. For a given
-// (x, y) position on the curve, the Jacobian coordinates are (x1, y1, z1)
-// where x = x1/z1² and y = y1/z1³. The greatest speedups come when the whole
-// calculation can be performed within the transform (as in ScalarMult and
-// ScalarBaseMult). But even for Add and Double, it's faster to apply and
-// reverse the transform than to operate in affine coordinates.
-
-// polynomial returns x³ - 3x + b.
-func (curve *CurveParams) polynomial(x *big.Int) *big.Int {
-	x3 := new(big.Int).Mul(x, x)
-	x3.Mul(x3, x)
-
-	threeX := new(big.Int).Lsh(x, 1)
-	threeX.Add(threeX, x)
-
-	x3.Sub(x3, threeX)
-	x3.Add(x3, curve.B)
-	x3.Mod(x3, curve.P)
-
-	return x3
-}
-
-func (curve *CurveParams) IsOnCurve(x, y *big.Int) bool {
-	// If there is a dedicated constant-time implementation for this curve operation,
-	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
-		return specific.IsOnCurve(x, y)
-	}
-
-	if x.Sign() < 0 || x.Cmp(curve.P) >= 0 ||
-		y.Sign() < 0 || y.Cmp(curve.P) >= 0 {
-		return false
-	}
-
-	// y² = x³ - 3x + b
-	y2 := new(big.Int).Mul(y, y)
-	y2.Mod(y2, curve.P)
-
-	return curve.polynomial(x).Cmp(y2) == 0
-}
-
-// zForAffine returns a Jacobian Z value for the affine point (x, y). If x and
-// y are zero, it assumes that they represent the point at infinity because (0,
-// 0) is not on the any of the curves handled here.
-func zForAffine(x, y *big.Int) *big.Int {
-	z := new(big.Int)
-	if x.Sign() != 0 || y.Sign() != 0 {
-		z.SetInt64(1)
-	}
-	return z
-}
-
-// affineFromJacobian reverses the Jacobian transform. See the comment at the
-// top of the file. If the point is ∞ it returns 0, 0.
-func (curve *CurveParams) affineFromJacobian(x, y, z *big.Int) (xOut, yOut *big.Int) {
-	if z.Sign() == 0 {
-		return new(big.Int), new(big.Int)
-	}
-
-	zinv := new(big.Int).ModInverse(z, curve.P)
-	zinvsq := new(big.Int).Mul(zinv, zinv)
-
-	xOut = new(big.Int).Mul(x, zinvsq)
-	xOut.Mod(xOut, curve.P)
-	zinvsq.Mul(zinvsq, zinv)
-	yOut = new(big.Int).Mul(y, zinvsq)
-	yOut.Mod(yOut, curve.P)
-	return
-}
-
-func (curve *CurveParams) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
-	// If there is a dedicated constant-time implementation for this curve operation,
-	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
-		return specific.Add(x1, y1, x2, y2)
-	}
-
-	z1 := zForAffine(x1, y1)
-	z2 := zForAffine(x2, y2)
-	return curve.affineFromJacobian(curve.addJacobian(x1, y1, z1, x2, y2, z2))
-}
-
-// addJacobian takes two points in Jacobian coordinates, (x1, y1, z1) and
-// (x2, y2, z2) and returns their sum, also in Jacobian form.
-func (curve *CurveParams) addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (*big.Int, *big.Int, *big.Int) {
-	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
-	x3, y3, z3 := new(big.Int), new(big.Int), new(big.Int)
-	if z1.Sign() == 0 {
-		x3.Set(x2)
-		y3.Set(y2)
-		z3.Set(z2)
-		return x3, y3, z3
-	}
-	if z2.Sign() == 0 {
-		x3.Set(x1)
-		y3.Set(y1)
-		z3.Set(z1)
-		return x3, y3, z3
-	}
-
-	z1z1 := new(big.Int).Mul(z1, z1)
-	z1z1.Mod(z1z1, curve.P)
-	z2z2 := new(big.Int).Mul(z2, z2)
-	z2z2.Mod(z2z2, curve.P)
-
-	u1 := new(big.Int).Mul(x1, z2z2)
-	u1.Mod(u1, curve.P)
-	u2 := new(big.Int).Mul(x2, z1z1)
-	u2.Mod(u2, curve.P)
-	h := new(big.Int).Sub(u2, u1)
-	xEqual := h.Sign() == 0
-	if h.Sign() == -1 {
-		h.Add(h, curve.P)
-	}
-	i := new(big.Int).Lsh(h, 1)
-	i.Mul(i, i)
-	j := new(big.Int).Mul(h, i)
-
-	s1 := new(big.Int).Mul(y1, z2)
-	s1.Mul(s1, z2z2)
-	s1.Mod(s1, curve.P)
-	s2 := new(big.Int).Mul(y2, z1)
-	s2.Mul(s2, z1z1)
-	s2.Mod(s2, curve.P)
-	r := new(big.Int).Sub(s2, s1)
-	if r.Sign() == -1 {
-		r.Add(r, curve.P)
-	}
-	yEqual := r.Sign() == 0
-	if xEqual && yEqual {
-		return curve.doubleJacobian(x1, y1, z1)
-	}
-	r.Lsh(r, 1)
-	v := new(big.Int).Mul(u1, i)
-
-	x3.Set(r)
-	x3.Mul(x3, x3)
-	x3.Sub(x3, j)
-	x3.Sub(x3, v)
-	x3.Sub(x3, v)
-	x3.Mod(x3, curve.P)
-
-	y3.Set(r)
-	v.Sub(v, x3)
-	y3.Mul(y3, v)
-	s1.Mul(s1, j)
-	s1.Lsh(s1, 1)
-	y3.Sub(y3, s1)
-	y3.Mod(y3, curve.P)
-
-	z3.Add(z1, z2)
-	z3.Mul(z3, z3)
-	z3.Sub(z3, z1z1)
-	z3.Sub(z3, z2z2)
-	z3.Mul(z3, h)
-	z3.Mod(z3, curve.P)
-
-	return x3, y3, z3
-}
-
-func (curve *CurveParams) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
-	// If there is a dedicated constant-time implementation for this curve operation,
-	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
-		return specific.Double(x1, y1)
-	}
-
-	z1 := zForAffine(x1, y1)
-	return curve.affineFromJacobian(curve.doubleJacobian(x1, y1, z1))
-}
-
-// doubleJacobian takes a point in Jacobian coordinates, (x, y, z), and
-// returns its double, also in Jacobian form.
-func (curve *CurveParams) doubleJacobian(x, y, z *big.Int) (*big.Int, *big.Int, *big.Int) {
-	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
-	delta := new(big.Int).Mul(z, z)
-	delta.Mod(delta, curve.P)
-	gamma := new(big.Int).Mul(y, y)
-	gamma.Mod(gamma, curve.P)
-	alpha := new(big.Int).Sub(x, delta)
-	if alpha.Sign() == -1 {
-		alpha.Add(alpha, curve.P)
-	}
-	alpha2 := new(big.Int).Add(x, delta)
-	alpha.Mul(alpha, alpha2)
-	alpha2.Set(alpha)
-	alpha.Lsh(alpha, 1)
-	alpha.Add(alpha, alpha2)
-
-	beta := alpha2.Mul(x, gamma)
-
-	x3 := new(big.Int).Mul(alpha, alpha)
-	beta8 := new(big.Int).Lsh(beta, 3)
-	beta8.Mod(beta8, curve.P)
-	x3.Sub(x3, beta8)
-	if x3.Sign() == -1 {
-		x3.Add(x3, curve.P)
-	}
-	x3.Mod(x3, curve.P)
-
-	z3 := new(big.Int).Add(y, z)
-	z3.Mul(z3, z3)
-	z3.Sub(z3, gamma)
-	if z3.Sign() == -1 {
-		z3.Add(z3, curve.P)
-	}
-	z3.Sub(z3, delta)
-	if z3.Sign() == -1 {
-		z3.Add(z3, curve.P)
-	}
-	z3.Mod(z3, curve.P)
-
-	beta.Lsh(beta, 2)
-	beta.Sub(beta, x3)
-	if beta.Sign() == -1 {
-		beta.Add(beta, curve.P)
-	}
-	y3 := alpha.Mul(alpha, beta)
-
-	gamma.Mul(gamma, gamma)
-	gamma.Lsh(gamma, 3)
-	gamma.Mod(gamma, curve.P)
-
-	y3.Sub(y3, gamma)
-	if y3.Sign() == -1 {
-		y3.Add(y3, curve.P)
-	}
-	y3.Mod(y3, curve.P)
-
-	return x3, y3, z3
-}
-
-func (curve *CurveParams) ScalarMult(Bx, By *big.Int, k []byte) (*big.Int, *big.Int) {
-	// If there is a dedicated constant-time implementation for this curve operation,
-	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p256, p384, p521); ok {
-		return specific.ScalarMult(Bx, By, k)
-	}
-
-	Bz := new(big.Int).SetInt64(1)
-	x, y, z := new(big.Int), new(big.Int), new(big.Int)
-
-	for _, byte := range k {
-		for bitNum := 0; bitNum < 8; bitNum++ {
-			x, y, z = curve.doubleJacobian(x, y, z)
-			if byte&0x80 == 0x80 {
-				x, y, z = curve.addJacobian(Bx, By, Bz, x, y, z)
-			}
-			byte <<= 1
-		}
-	}
-
-	return curve.affineFromJacobian(x, y, z)
-}
-
-func (curve *CurveParams) ScalarBaseMult(k []byte) (*big.Int, *big.Int) {
-	// If there is a dedicated constant-time implementation for this curve operation,
-	// use that instead of the generic one.
-	if specific, ok := matchesSpecificCurve(curve, p224, p256, p384, p521); ok {
-		return specific.ScalarBaseMult(k)
-	}
-
-	return curve.ScalarMult(curve.Gx, curve.Gy, k)
-}
-
 var mask = []byte{0xff, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f}
 
 // GenerateKey returns a public/private key pair. The private key is
diff --git a/src/crypto/elliptic/nistec.go b/src/crypto/elliptic/nistec.go
new file mode 100644
index 0000000..c6f170b
--- /dev/null
+++ b/src/crypto/elliptic/nistec.go
@@ -0,0 +1,223 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package elliptic
+
+import (
+	"crypto/elliptic/internal/nistec"
+	"crypto/rand"
+	"errors"
+	"math/big"
+)
+
+var p224 = &nistCurve[*nistec.P224Point]{
+	newPoint:     nistec.NewP224Point,
+	newGenerator: nistec.NewP224Generator,
+}
+
+func initP224() {
+	p224.params = &CurveParams{
+		Name:    "P-224",
+		BitSize: 224,
+		// FIPS 186-4, section D.1.2.2
+		P:  bigFromDecimal("26959946667150639794667015087019630673557916260026308143510066298881"),
+		N:  bigFromDecimal("26959946667150639794667015087019625940457807714424391721682722368061"),
+		B:  bigFromHex("b4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4"),
+		Gx: bigFromHex("b70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21"),
+		Gy: bigFromHex("bd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34"),
+	}
+}
+
+var p384 = &nistCurve[*nistec.P384Point]{
+	newPoint:     nistec.NewP384Point,
+	newGenerator: nistec.NewP384Generator,
+}
+
+func initP384() {
+	p384.params = &CurveParams{
+		Name:    "P-384",
+		BitSize: 384,
+		// FIPS 186-4, section D.1.2.4
+		P: bigFromDecimal("394020061963944792122790401001436138050797392704654" +
+			"46667948293404245721771496870329047266088258938001861606973112319"),
+		N: bigFromDecimal("394020061963944792122790401001436138050797392704654" +
+			"46667946905279627659399113263569398956308152294913554433653942643"),
+		B: bigFromHex("b3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088" +
+			"f5013875ac656398d8a2ed19d2a85c8edd3ec2aef"),
+		Gx: bigFromHex("aa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741" +
+			"e082542a385502f25dbf55296c3a545e3872760ab7"),
+		Gy: bigFromHex("3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da31" +
+			"13b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f"),
+	}
+}
+
+var p521 = &nistCurve[*nistec.P521Point]{
+	newPoint:     nistec.NewP521Point,
+	newGenerator: nistec.NewP521Generator,
+}
+
+func initP521() {
+	p521.params = &CurveParams{
+		Name:    "P-521",
+		BitSize: 521,
+		// FIPS 186-4, section D.1.2.5
+		P: bigFromDecimal("68647976601306097149819007990813932172694353001433" +
+			"0540939446345918554318339765605212255964066145455497729631139148" +
+			"0858037121987999716643812574028291115057151"),
+		N: bigFromDecimal("68647976601306097149819007990813932172694353001433" +
+			"0540939446345918554318339765539424505774633321719753296399637136" +
+			"3321113864768612440380340372808892707005449"),
+		B: bigFromHex("0051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8" +
+			"b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef" +
+			"451fd46b503f00"),
+		Gx: bigFromHex("00c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f8" +
+			"28af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf9" +
+			"7e7e31c2e5bd66"),
+		Gy: bigFromHex("011839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817" +
+			"afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088" +
+			"be94769fd16650"),
+	}
+}
+
+// nistCurve is a Curve implementation based on a nistec Point.
+//
+// It's a wrapper that exposes the big.Int-based Curve interface and encodes the
+// legacy idiosyncrasies it requires, such as invalid and infinity point
+// handling.
+//
+// To interact with the nistec package, points are encoded into and decoded from
+// properly formatted byte slices. All big.Int use is limited to this package.
+// Encoding and decoding is 1/1000th of the runtime of a scalar multiplication,
+// so the overhead is acceptable.
+type nistCurve[Point nistPoint[Point]] struct {
+	newPoint     func() Point
+	newGenerator func() Point
+	params       *CurveParams
+}
+
+// nistPoint is a generic constraint for the nistec Point types.
+type nistPoint[T any] interface {
+	Bytes() []byte
+	SetBytes([]byte) (T, error)
+	Add(T, T) T
+	Double(T) T
+	ScalarMult(T, []byte) T
+}
+
+func (curve *nistCurve[Point]) Params() *CurveParams {
+	return curve.params
+}
+
+func (curve *nistCurve[Point]) IsOnCurve(x, y *big.Int) bool {
+	// IsOnCurve is documented to reject (0, 0), the conventional point at
+	// infinity, which however is accepted by pointFromAffine.
+	if x.Sign() == 0 && y.Sign() == 0 {
+		return false
+	}
+	_, err := curve.pointFromAffine(x, y)
+	return err == nil
+}
+
+func (curve *nistCurve[Point]) pointFromAffine(x, y *big.Int) (p Point, err error) {
+	p = curve.newPoint()
+	// (0, 0) is by convention the point at infinity, which can't be represented
+	// in affine coordinates. See Issue 37294.
+	if x.Sign() == 0 && y.Sign() == 0 {
+		return p, nil
+	}
+	// Reject values that would not get correctly encoded.
+	if x.Sign() < 0 || y.Sign() < 0 {
+		return p, errors.New("negative coordinate")
+	}
+	if x.BitLen() > curve.params.BitSize || y.BitLen() > curve.params.BitSize {
+		return p, errors.New("overflowing coordinate")
+	}
+	// Encode the coordinates and let SetBytes reject invalid points.
+	byteLen := (curve.params.BitSize + 7) / 8
+	buf := make([]byte, 1+2*byteLen)
+	buf[0] = 4 // uncompressed point
+	x.FillBytes(buf[1 : 1+byteLen])
+	y.FillBytes(buf[1+byteLen : 1+2*byteLen])
+	return p.SetBytes(buf)
+}
+
+func (curve *nistCurve[Point]) pointToAffine(p Point) (x, y *big.Int) {
+	out := p.Bytes()
+	if len(out) == 1 && out[0] == 0 {
+		// This is the correct encoding of the point at infinity, which
+		// Unmarshal does not support. See Issue 37294.
+		return new(big.Int), new(big.Int)
+	}
+	x, y = Unmarshal(curve, out)
+	if x == nil {
+		panic("crypto/elliptic: internal error: Unmarshal rejected a valid point encoding")
+	}
+	return x, y
+}
+
+// randomPoint returns a random point on the curve. It's used when Add,
+// Double, or ScalarMult are fed a point not on the curve, which is undefined
+// behavior. Originally, we used to do the math on it anyway (which allows
+// invalid curve attacks) and relied on the caller and Unmarshal to avoid this
+// happening in the first place. Now, we just can't construct a nistec Point
+// for an invalid pair of coordinates, because that API is safer. If we panic,
+// we risk introducing a DoS. If we return nil, we risk a panic. If we return
+// the input, ecdsa.Verify might fail open. The safest course seems to be to
+// return a valid, random point, which hopefully won't help the attacker.
+func (curve *nistCurve[Point]) randomPoint() (x, y *big.Int) {
+	_, x, y, err := GenerateKey(curve, rand.Reader)
+	if err != nil {
+		panic("crypto/elliptic: failed to generate random point")
+	}
+	return x, y
+}
+
+func (curve *nistCurve[Point]) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
+	p1, err := curve.pointFromAffine(x1, y1)
+	if err != nil {
+		return curve.randomPoint()
+	}
+	p2, err := curve.pointFromAffine(x2, y2)
+	if err != nil {
+		return curve.randomPoint()
+	}
+	return curve.pointToAffine(p1.Add(p1, p2))
+}
+
+func (curve *nistCurve[Point]) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
+	p, err := curve.pointFromAffine(x1, y1)
+	if err != nil {
+		return curve.randomPoint()
+	}
+	return curve.pointToAffine(p.Double(p))
+}
+
+func (curve *nistCurve[Point]) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *big.Int) {
+	p, err := curve.pointFromAffine(Bx, By)
+	if err != nil {
+		return curve.randomPoint()
+	}
+	return curve.pointToAffine(p.ScalarMult(p, scalar))
+}
+
+func (curve *nistCurve[Point]) ScalarBaseMult(scalar []byte) (*big.Int, *big.Int) {
+	p := curve.newGenerator()
+	return curve.pointToAffine(p.ScalarMult(p, scalar))
+}
+
+func bigFromDecimal(s string) *big.Int {
+	b, ok := new(big.Int).SetString(s, 10)
+	if !ok {
+		panic("invalid encoding")
+	}
+	return b
+}
+
+func bigFromHex(s string) *big.Int {
+	b, ok := new(big.Int).SetString(s, 16)
+	if !ok {
+		panic("invalid encoding")
+	}
+	return b
+}
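
The byte-slice boundary described in the nistCurve comment is just the standard uncompressed point encoding, 0x04 || X || Y, which is why Marshal/Unmarshal can round-trip through it:

```go
package main

import (
	"crypto/elliptic"
	"fmt"
)

func main() {
	curve := elliptic.P224()
	params := curve.Params()
	x, y := params.Gx, params.Gy

	// The same layout pointFromAffine builds by hand: 0x04 || X || Y,
	// with both coordinates zero-padded to the field byte length.
	byteLen := (params.BitSize + 7) / 8
	buf := make([]byte, 1+2*byteLen)
	buf[0] = 4 // uncompressed point
	x.FillBytes(buf[1 : 1+byteLen])
	y.FillBytes(buf[1+byteLen:])

	// Unmarshal round-trips the encoding, which is how invalid inputs
	// get rejected before any curve arithmetic happens.
	x2, y2 := elliptic.Unmarshal(curve, buf)
	fmt.Println(x2.Cmp(x) == 0 && y2.Cmp(y) == 0) // true
}
```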
diff --git a/src/crypto/elliptic/p224.go b/src/crypto/elliptic/p224.go
deleted file mode 100644
index 8a431c4..0000000
--- a/src/crypto/elliptic/p224.go
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package elliptic
-
-import (
-	"crypto/elliptic/internal/nistec"
-	"crypto/rand"
-	"math/big"
-)
-
-// p224Curve is a Curve implementation based on nistec.P224Point.
-//
-// It's a wrapper that exposes the big.Int-based Curve interface and encodes the
-// legacy idiosyncrasies it requires, such as invalid and infinity point
-// handling.
-//
-// To interact with the nistec package, points are encoded into and decoded from
-// properly formatted byte slices. All big.Int use is limited to this package.
-// Encoding and decoding is 1/1000th of the runtime of a scalar multiplication,
-// so the overhead is acceptable.
-type p224Curve struct {
-	params *CurveParams
-}
-
-var p224 p224Curve
-var _ Curve = p224
-
-func initP224() {
-	p224.params = &CurveParams{
-		Name:    "P-224",
-		BitSize: 224,
-		// FIPS 186-4, section D.1.2.2
-		P:  bigFromDecimal("26959946667150639794667015087019630673557916260026308143510066298881"),
-		N:  bigFromDecimal("26959946667150639794667015087019625940457807714424391721682722368061"),
-		B:  bigFromHex("b4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4"),
-		Gx: bigFromHex("b70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21"),
-		Gy: bigFromHex("bd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34"),
-	}
-}
-
-func (curve p224Curve) Params() *CurveParams {
-	return curve.params
-}
-
-func (curve p224Curve) IsOnCurve(x, y *big.Int) bool {
-	// IsOnCurve is documented to reject (0, 0), the conventional point at
-	// infinity, which however is accepted by p224PointFromAffine.
-	if x.Sign() == 0 && y.Sign() == 0 {
-		return false
-	}
-	_, ok := p224PointFromAffine(x, y)
-	return ok
-}
-
-func p224PointFromAffine(x, y *big.Int) (p *nistec.P224Point, ok bool) {
-	// (0, 0) is by convention the point at infinity, which can't be represented
-	// in affine coordinates. Marshal incorrectly encodes it as an uncompressed
-	// point, which SetBytes would correctly reject. See Issue 37294.
-	if x.Sign() == 0 && y.Sign() == 0 {
-		return nistec.NewP224Point(), true
-	}
-	if x.Sign() < 0 || y.Sign() < 0 {
-		return nil, false
-	}
-	if x.BitLen() > 224 || y.BitLen() > 224 {
-		return nil, false
-	}
-	p, err := nistec.NewP224Point().SetBytes(Marshal(P224(), x, y))
-	if err != nil {
-		return nil, false
-	}
-	return p, true
-}
-
-func p224PointToAffine(p *nistec.P224Point) (x, y *big.Int) {
-	out := p.Bytes()
-	if len(out) == 1 && out[0] == 0 {
-		// This is the correct encoding of the point at infinity, which
-		// Unmarshal does not support. See Issue 37294.
-		return new(big.Int), new(big.Int)
-	}
-	x, y = Unmarshal(P224(), out)
-	if x == nil {
-		panic("crypto/elliptic: internal error: Unmarshal rejected a valid point encoding")
-	}
-	return x, y
-}
-
-// p224RandomPoint returns a random point on the curve. It's used when Add,
-// Double, or ScalarMult are fed a point not on the curve, which is undefined
-// behavior. Originally, we used to do the math on it anyway (which allows
-// invalid curve attacks) and relied on the caller and Unmarshal to avoid this
-// happening in the first place. Now, we just can't construct a nistec.P224Point
-// for an invalid pair of coordinates, because that API is safer. If we panic,
-// we risk introducing a DoS. If we return nil, we risk a panic. If we return
-// the input, ecdsa.Verify might fail open. The safest course seems to be to
-// return a valid, random point, which hopefully won't help the attacker.
-func p224RandomPoint() (x, y *big.Int) {
-	_, x, y, err := GenerateKey(P224(), rand.Reader)
-	if err != nil {
-		panic("crypto/elliptic: failed to generate random point")
-	}
-	return x, y
-}
-
-func (p224Curve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
-	p1, ok := p224PointFromAffine(x1, y1)
-	if !ok {
-		return p224RandomPoint()
-	}
-	p2, ok := p224PointFromAffine(x2, y2)
-	if !ok {
-		return p224RandomPoint()
-	}
-	return p224PointToAffine(p1.Add(p1, p2))
-}
-
-func (p224Curve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
-	p, ok := p224PointFromAffine(x1, y1)
-	if !ok {
-		return p224RandomPoint()
-	}
-	return p224PointToAffine(p.Double(p))
-}
-
-func (p224Curve) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *big.Int) {
-	p, ok := p224PointFromAffine(Bx, By)
-	if !ok {
-		return p224RandomPoint()
-	}
-	return p224PointToAffine(p.ScalarMult(p, scalar))
-}
-
-func (p224Curve) ScalarBaseMult(scalar []byte) (*big.Int, *big.Int) {
-	p := nistec.NewP224Generator()
-	return p224PointToAffine(p.ScalarMult(p, scalar))
-}
diff --git a/src/crypto/elliptic/p256.go b/src/crypto/elliptic/p256.go
index 763b842..97ecda5 100644
--- a/src/crypto/elliptic/p256.go
+++ b/src/crypto/elliptic/p256.go
@@ -1,28 +1,19 @@
-// Copyright 2013 The Go Authors. All rights reserved.
+// Copyright 2021 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 && !arm64
-
 package elliptic
 
 // P-256 is implemented by various different backends, including a generic
-// 32-bit constant-time one in this file, which is used when assembly
+// 32-bit constant-time one in p256_generic.go, which is used when assembly
 // implementations are not available, or not appropriate for the hardware.
 
 import "math/big"
 
-type p256Curve struct {
-	*CurveParams
-}
+var p256Params *CurveParams
 
-var (
-	p256Params *CurveParams
-
-	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
-	// (2**257).
-	p256RInverse *big.Int
-)
+// RInverse contains 1/R mod p, the inverse of the Montgomery constant 2^257.
+var p256RInverse *big.Int
 
 func initP256() {
 	// See FIPS 186-3, section D.2.3
@@ -39,1162 +30,3 @@
 	// Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
 	initP256Arch()
 }
-
-func (curve p256Curve) Params() *CurveParams {
-	return curve.CurveParams
-}
-
-// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
-// to out. If the scalar is equal or greater than the order of the group, it's
-// reduced modulo that order.
-func p256GetScalar(out *[32]byte, in []byte) {
-	n := new(big.Int).SetBytes(in)
-	var scalarBytes []byte
-
-	if n.Cmp(p256Params.N) >= 0 || len(in) > len(out) {
-		n.Mod(n, p256Params.N)
-		scalarBytes = n.Bytes()
-	} else {
-		scalarBytes = in
-	}
-
-	for i, v := range scalarBytes {
-		out[len(scalarBytes)-(1+i)] = v
-	}
-}
-
-func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
-	var scalarReversed [32]byte
-	p256GetScalar(&scalarReversed, scalar)
-
-	var x1, y1, z1 [p256Limbs]uint32
-	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
-	return p256ToAffine(&x1, &y1, &z1)
-}
-
-func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
-	var scalarReversed [32]byte
-	p256GetScalar(&scalarReversed, scalar)
-
-	var px, py, x1, y1, z1 [p256Limbs]uint32
-	p256FromBig(&px, bigX)
-	p256FromBig(&py, bigY)
-	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
-	return p256ToAffine(&x1, &y1, &z1)
-}
-
-// Field elements are represented as nine, unsigned 32-bit words.
-//
-// The value of a field element is:
-//   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
-//
-// That is, each limb is alternately 29 or 28-bits wide in little-endian
-// order.
-//
-// This means that a field element hits 2**257, rather than 2**256 as we would
-// like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
-// problems when multiplying as terms end up one bit short of a limb which
-// would require much bit-shifting to correct.
-//
-// Finally, the values stored in a field element are in Montgomery form. So the
-// value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
-// 2**257.
-
-const (
-	p256Limbs    = 9
-	bottom29Bits = 0x1fffffff
-)
-
-var (
-	// p256One is the number 1 as a field element.
-	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
-	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
-	// p256P is the prime modulus as a field element.
-	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
-	// p2562P is the twice prime modulus as a field element.
-	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
-)
-
-// p256Precomputed contains precomputed values to aid the calculation of scalar
-// multiples of the base point, G. It's actually two, equal length, tables
-// concatenated.
-//
-// The first table contains (x,y) field element pairs for 16 multiples of the
-// base point, G.
-//
-//	Index  |  Index (binary) | Value
-//	    0  |           0000  | 0G (all zeros, omitted)
-//	    1  |           0001  | G
-//	    2  |           0010  | 2**64G
-//	    3  |           0011  | 2**64G + G
-//	    4  |           0100  | 2**128G
-//	    5  |           0101  | 2**128G + G
-//	    6  |           0110  | 2**128G + 2**64G
-//	    7  |           0111  | 2**128G + 2**64G + G
-//	    8  |           1000  | 2**192G
-//	    9  |           1001  | 2**192G + G
-//	   10  |           1010  | 2**192G + 2**64G
-//	   11  |           1011  | 2**192G + 2**64G + G
-//	   12  |           1100  | 2**192G + 2**128G
-//	   13  |           1101  | 2**192G + 2**128G + G
-//	   14  |           1110  | 2**192G + 2**128G + 2**64G
-//	   15  |           1111  | 2**192G + 2**128G + 2**64G + G
-//
-// The second table follows the same style, but the terms are 2**32G,
-// 2**96G, 2**160G, 2**224G.
-//
-// This is ~2KB of data.
-var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
-	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
-	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
-	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
-	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
-	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
-	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
-	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
-	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
-	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
-	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
-	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
-	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
-	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
-	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
-	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
-	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
-	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
-	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
-	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
-	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
-	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
-	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
-	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
-	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
-	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
-	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
-	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
-	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
-	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
-	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
-	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
-	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
-	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
-	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
-	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
-	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
-	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
-	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
-	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
-	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
-	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
-	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
-	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
-	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
-	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
-	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
-	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
-	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
-	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
-	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
-	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
-	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
-	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
-	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
-	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
-	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
-	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
-	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
-	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
-	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
-}
-
-// Field element operations:
-
-const bottom28Bits = 0xfffffff
-
-// nonZeroToAllOnes returns:
-//
-//	0xffffffff for 0 < x <= 2**31
-//	0 for x == 0 or x > 2**31.
-func nonZeroToAllOnes(x uint32) uint32 {
-	return ((x - 1) >> 31) - 1
-}
-
-// p256ReduceCarry adds a multiple of p in order to cancel |carry|,
-// which is a term at 2**257.
-//
-// On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
-// On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
-func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
-	carry_mask := nonZeroToAllOnes(carry)
-
-	inout[0] += carry << 1
-	inout[3] += 0x10000000 & carry_mask
-	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
-	// previous line therefore this doesn't underflow.
-	inout[3] -= carry << 11
-	inout[4] += (0x20000000 - 1) & carry_mask
-	inout[5] += (0x10000000 - 1) & carry_mask
-	inout[6] += (0x20000000 - 1) & carry_mask
-	inout[6] -= carry << 22
-	// This may underflow if carry is non-zero but, if so, we'll fix it in the
-	// next line.
-	inout[7] -= 1 & carry_mask
-	inout[7] += carry << 25
-}
-
-// p256Sum sets out = in+in2.
-//
-// On entry, in[i]+in2[i] must not overflow a 32-bit word.
-// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
-func p256Sum(out, in, in2 *[p256Limbs]uint32) {
-	carry := uint32(0)
-	for i := 0; ; i++ {
-		out[i] = in[i] + in2[i]
-		out[i] += carry
-		carry = out[i] >> 29
-		out[i] &= bottom29Bits
-
-		i++
-		if i == p256Limbs {
-			break
-		}
-
-		out[i] = in[i] + in2[i]
-		out[i] += carry
-		carry = out[i] >> 28
-		out[i] &= bottom28Bits
-	}
-
-	p256ReduceCarry(out, carry)
-}
-
-const (
-	two30m2    = 1<<30 - 1<<2
-	two30p13m2 = 1<<30 + 1<<13 - 1<<2
-	two31m2    = 1<<31 - 1<<2
-	two31m3    = 1<<31 - 1<<3
-	two31p24m2 = 1<<31 + 1<<24 - 1<<2
-	two30m27m2 = 1<<30 - 1<<27 - 1<<2
-)
-
-// p256Zero31 is 0 mod p.
-var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
-
-// p256Diff sets out = in-in2.
-//
-// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
-// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
-//
-// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-func p256Diff(out, in, in2 *[p256Limbs]uint32) {
-	var carry uint32
-
-	for i := 0; ; i++ {
-		out[i] = in[i] - in2[i]
-		out[i] += p256Zero31[i]
-		out[i] += carry
-		carry = out[i] >> 29
-		out[i] &= bottom29Bits
-
-		i++
-		if i == p256Limbs {
-			break
-		}
-
-		out[i] = in[i] - in2[i]
-		out[i] += p256Zero31[i]
-		out[i] += carry
-		carry = out[i] >> 28
-		out[i] &= bottom28Bits
-	}
-
-	p256ReduceCarry(out, carry)
-}
-
-// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
-// the same 29,28,... bit positions as a field element.
-//
-// The values in field elements are in Montgomery form: x*R mod p where R =
-// 2**257. Since we just multiplied two Montgomery values together, the result
-// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
-// in Montgomery form.
-//
-// On entry: tmp[i] < 2**64
-// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
-func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
-	// The following table may be helpful when reading this code:
-	//
-	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
-	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
-	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
-	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
-	var tmp2 [18]uint32
-	var carry, x, xMask uint32
-
-	// tmp contains 64-bit words with the same 29,28,29-bit positions as a
-	// field element. So the top of an element of tmp might overlap with
-	// another element two positions down. The following loop eliminates
-	// this overlap.
-	tmp2[0] = uint32(tmp[0]) & bottom29Bits
-
-	tmp2[1] = uint32(tmp[0]) >> 29
-	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
-	tmp2[1] += uint32(tmp[1]) & bottom28Bits
-	carry = tmp2[1] >> 28
-	tmp2[1] &= bottom28Bits
-
-	for i := 2; i < 17; i++ {
-		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
-		tmp2[i] += (uint32(tmp[i-1])) >> 28
-		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
-		tmp2[i] += uint32(tmp[i]) & bottom29Bits
-		tmp2[i] += carry
-		carry = tmp2[i] >> 29
-		tmp2[i] &= bottom29Bits
-
-		i++
-		if i == 17 {
-			break
-		}
-		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
-		tmp2[i] += uint32(tmp[i-1]) >> 29
-		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
-		tmp2[i] += uint32(tmp[i]) & bottom28Bits
-		tmp2[i] += carry
-		carry = tmp2[i] >> 28
-		tmp2[i] &= bottom28Bits
-	}
-
-	tmp2[17] = uint32(tmp[15]>>32) >> 25
-	tmp2[17] += uint32(tmp[16]) >> 29
-	tmp2[17] += uint32(tmp[16]>>32) << 3
-	tmp2[17] += carry
-
-	// Montgomery elimination of terms:
-	//
-	// Since R is 2**257, we can divide by R with a bitwise shift if we can
-	// ensure that the right-most 257 bits are all zero. We can make that true
-	// by adding multiplies of p without affecting the value.
-	//
-	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
-	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
-	// We can do that for 8 further limbs and then right shift to eliminate the
-	// extra factor of R.
-	for i := 0; ; i += 2 {
-		tmp2[i+1] += tmp2[i] >> 29
-		x = tmp2[i] & bottom29Bits
-		xMask = nonZeroToAllOnes(x)
-		tmp2[i] = 0
-
-		// The bounds calculations for this loop are tricky. Each iteration of
-		// the loop eliminates two words by adding values to words to their
-		// right.
-		//
-		// The following table contains the amounts added to each word (as an
-		// offset from the value of i at the top of the loop). The amounts are
-		// accounted for from the first and second half of the loop separately
-		// and are written as, for example, 28 to mean a value <2**28.
-		//
-		// Word:                   3   4   5   6   7   8   9   10
-		// Added in top half:     28  11      29  21  29  28
-		//                                        28  29
-		//                                            29
-		// Added in bottom half:      29  10      28  21  28   28
-		//                                            29
-		//
-		// The value that is currently offset 7 will be offset 5 for the next
-		// iteration and then offset 3 for the iteration after that. Therefore
-		// the total value added will be the values added at 7, 5 and 3.
-		//
-		// The following table accumulates these values. The sums at the bottom
-		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
-		//
-		// Word:                   3   4   5   6   7   8   9  10  11  12  13
-		//                        28  11  10  29  21  29  28  28  28  28  28
-		//                            29  28  11  28  29  28  29  28  29  28
-		//                                    29  28  21  21  29  21  29  21
-		//                                        10  29  28  21  28  21  28
-		//                                        28  29  28  29  28  29  28
-		//                                            11  10  29  10  29  10
-		//                                            29  28  11  28  11
-		//                                                    29      29
-		//                        --------------------------------------------
-		//                                                30+ 31+ 30+ 31+ 30+
-		//                                                28+ 29+ 28+ 29+ 21+
-		//                                                21+ 28+ 21+ 28+ 10
-		//                                                10  21+ 10  21+
-		//                                                    11      11
-		//
-		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
-		// tmp2[10/12] has an initial value of <2**29, then the maximum value
-		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
-		// as required.
-		tmp2[i+3] += (x << 10) & bottom28Bits
-		tmp2[i+4] += (x >> 18)
-
-		tmp2[i+6] += (x << 21) & bottom29Bits
-		tmp2[i+7] += x >> 8
-
-		// At position 200, which is the starting bit position for word 7, we
-		// have a factor of 0xf000000 = 2**28 - 2**24.
-		tmp2[i+7] += 0x10000000 & xMask
-		tmp2[i+8] += (x - 1) & xMask
-		tmp2[i+7] -= (x << 24) & bottom28Bits
-		tmp2[i+8] -= x >> 4
-
-		tmp2[i+8] += 0x20000000 & xMask
-		tmp2[i+8] -= x
-		tmp2[i+8] += (x << 28) & bottom29Bits
-		tmp2[i+9] += ((x >> 1) - 1) & xMask
-
-		if i+1 == p256Limbs {
-			break
-		}
-		tmp2[i+2] += tmp2[i+1] >> 28
-		x = tmp2[i+1] & bottom28Bits
-		xMask = nonZeroToAllOnes(x)
-		tmp2[i+1] = 0
-
-		tmp2[i+4] += (x << 11) & bottom29Bits
-		tmp2[i+5] += (x >> 18)
-
-		tmp2[i+7] += (x << 21) & bottom28Bits
-		tmp2[i+8] += x >> 7
-
-		// At position 199, which is the starting bit of the 8th word when
-		// dealing with a context starting on an odd word, we have a factor of
-		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
-		// word from i+1 is i+8.
-		tmp2[i+8] += 0x20000000 & xMask
-		tmp2[i+9] += (x - 1) & xMask
-		tmp2[i+8] -= (x << 25) & bottom29Bits
-		tmp2[i+9] -= x >> 4
-
-		tmp2[i+9] += 0x10000000 & xMask
-		tmp2[i+9] -= x
-		tmp2[i+10] += (x - 1) & xMask
-	}
-
-	// We merge the right shift with a carry chain. The words above 2**257 have
-	// widths of 28,29,... which we need to correct when copying them down.
-	carry = 0
-	for i := 0; i < 8; i++ {
-		// The maximum value of tmp2[i + 9] occurs on the first iteration and
-		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
-		// therefore safe.
-		out[i] = tmp2[i+9]
-		out[i] += carry
-		out[i] += (tmp2[i+10] << 28) & bottom29Bits
-		carry = out[i] >> 29
-		out[i] &= bottom29Bits
-
-		i++
-		out[i] = tmp2[i+9] >> 1
-		out[i] += carry
-		carry = out[i] >> 28
-		out[i] &= bottom28Bits
-	}
-
-	out[8] = tmp2[17]
-	out[8] += carry
-	carry = out[8] >> 29
-	out[8] &= bottom29Bits
-
-	p256ReduceCarry(out, carry)
-}
-
-// p256Square sets out=in*in.
-//
-// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
-// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-func p256Square(out, in *[p256Limbs]uint32) {
-	var tmp [17]uint64
-
-	tmp[0] = uint64(in[0]) * uint64(in[0])
-	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
-	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
-		uint64(in[1])*(uint64(in[1])<<1)
-	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
-		uint64(in[1])*(uint64(in[2])<<1)
-	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
-		uint64(in[1])*(uint64(in[3])<<2) +
-		uint64(in[2])*uint64(in[2])
-	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
-		uint64(in[1])*(uint64(in[4])<<1) +
-		uint64(in[2])*(uint64(in[3])<<1)
-	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
-		uint64(in[1])*(uint64(in[5])<<2) +
-		uint64(in[2])*(uint64(in[4])<<1) +
-		uint64(in[3])*(uint64(in[3])<<1)
-	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
-		uint64(in[1])*(uint64(in[6])<<1) +
-		uint64(in[2])*(uint64(in[5])<<1) +
-		uint64(in[3])*(uint64(in[4])<<1)
-	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
-	// which is < 2**64 as required.
-	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
-		uint64(in[1])*(uint64(in[7])<<2) +
-		uint64(in[2])*(uint64(in[6])<<1) +
-		uint64(in[3])*(uint64(in[5])<<2) +
-		uint64(in[4])*uint64(in[4])
-	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
-		uint64(in[2])*(uint64(in[7])<<1) +
-		uint64(in[3])*(uint64(in[6])<<1) +
-		uint64(in[4])*(uint64(in[5])<<1)
-	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
-		uint64(in[3])*(uint64(in[7])<<2) +
-		uint64(in[4])*(uint64(in[6])<<1) +
-		uint64(in[5])*(uint64(in[5])<<1)
-	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
-		uint64(in[4])*(uint64(in[7])<<1) +
-		uint64(in[5])*(uint64(in[6])<<1)
-	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
-		uint64(in[5])*(uint64(in[7])<<2) +
-		uint64(in[6])*uint64(in[6])
-	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
-		uint64(in[6])*(uint64(in[7])<<1)
-	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
-		uint64(in[7])*(uint64(in[7])<<1)
-	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
-	tmp[16] = uint64(in[8]) * uint64(in[8])
-
-	p256ReduceDegree(out, tmp)
-}
-
-// p256Mul sets out=in*in2.
-//
-// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
-//
-//	in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
-//
-// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-func p256Mul(out, in, in2 *[p256Limbs]uint32) {
-	var tmp [17]uint64
-
-	tmp[0] = uint64(in[0]) * uint64(in2[0])
-	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
-		uint64(in[1])*(uint64(in2[0])<<0)
-	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
-		uint64(in[1])*(uint64(in2[1])<<1) +
-		uint64(in[2])*(uint64(in2[0])<<0)
-	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
-		uint64(in[1])*(uint64(in2[2])<<0) +
-		uint64(in[2])*(uint64(in2[1])<<0) +
-		uint64(in[3])*(uint64(in2[0])<<0)
-	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
-		uint64(in[1])*(uint64(in2[3])<<1) +
-		uint64(in[2])*(uint64(in2[2])<<0) +
-		uint64(in[3])*(uint64(in2[1])<<1) +
-		uint64(in[4])*(uint64(in2[0])<<0)
-	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
-		uint64(in[1])*(uint64(in2[4])<<0) +
-		uint64(in[2])*(uint64(in2[3])<<0) +
-		uint64(in[3])*(uint64(in2[2])<<0) +
-		uint64(in[4])*(uint64(in2[1])<<0) +
-		uint64(in[5])*(uint64(in2[0])<<0)
-	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
-		uint64(in[1])*(uint64(in2[5])<<1) +
-		uint64(in[2])*(uint64(in2[4])<<0) +
-		uint64(in[3])*(uint64(in2[3])<<1) +
-		uint64(in[4])*(uint64(in2[2])<<0) +
-		uint64(in[5])*(uint64(in2[1])<<1) +
-		uint64(in[6])*(uint64(in2[0])<<0)
-	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
-		uint64(in[1])*(uint64(in2[6])<<0) +
-		uint64(in[2])*(uint64(in2[5])<<0) +
-		uint64(in[3])*(uint64(in2[4])<<0) +
-		uint64(in[4])*(uint64(in2[3])<<0) +
-		uint64(in[5])*(uint64(in2[2])<<0) +
-		uint64(in[6])*(uint64(in2[1])<<0) +
-		uint64(in[7])*(uint64(in2[0])<<0)
-	// tmp[8] has the greatest value but doesn't overflow. See logic in
-	// p256Square.
-	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
-		uint64(in[1])*(uint64(in2[7])<<1) +
-		uint64(in[2])*(uint64(in2[6])<<0) +
-		uint64(in[3])*(uint64(in2[5])<<1) +
-		uint64(in[4])*(uint64(in2[4])<<0) +
-		uint64(in[5])*(uint64(in2[3])<<1) +
-		uint64(in[6])*(uint64(in2[2])<<0) +
-		uint64(in[7])*(uint64(in2[1])<<1) +
-		uint64(in[8])*(uint64(in2[0])<<0)
-	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
-		uint64(in[2])*(uint64(in2[7])<<0) +
-		uint64(in[3])*(uint64(in2[6])<<0) +
-		uint64(in[4])*(uint64(in2[5])<<0) +
-		uint64(in[5])*(uint64(in2[4])<<0) +
-		uint64(in[6])*(uint64(in2[3])<<0) +
-		uint64(in[7])*(uint64(in2[2])<<0) +
-		uint64(in[8])*(uint64(in2[1])<<0)
-	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
-		uint64(in[3])*(uint64(in2[7])<<1) +
-		uint64(in[4])*(uint64(in2[6])<<0) +
-		uint64(in[5])*(uint64(in2[5])<<1) +
-		uint64(in[6])*(uint64(in2[4])<<0) +
-		uint64(in[7])*(uint64(in2[3])<<1) +
-		uint64(in[8])*(uint64(in2[2])<<0)
-	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
-		uint64(in[4])*(uint64(in2[7])<<0) +
-		uint64(in[5])*(uint64(in2[6])<<0) +
-		uint64(in[6])*(uint64(in2[5])<<0) +
-		uint64(in[7])*(uint64(in2[4])<<0) +
-		uint64(in[8])*(uint64(in2[3])<<0)
-	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
-		uint64(in[5])*(uint64(in2[7])<<1) +
-		uint64(in[6])*(uint64(in2[6])<<0) +
-		uint64(in[7])*(uint64(in2[5])<<1) +
-		uint64(in[8])*(uint64(in2[4])<<0)
-	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
-		uint64(in[6])*(uint64(in2[7])<<0) +
-		uint64(in[7])*(uint64(in2[6])<<0) +
-		uint64(in[8])*(uint64(in2[5])<<0)
-	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
-		uint64(in[7])*(uint64(in2[7])<<1) +
-		uint64(in[8])*(uint64(in2[6])<<0)
-	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
-		uint64(in[8])*(uint64(in2[7])<<0)
-	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
-
-	p256ReduceDegree(out, tmp)
-}
-
-func p256Assign(out, in *[p256Limbs]uint32) {
-	*out = *in
-}
-
-// p256Invert calculates |out| = |in|^{-1}
-//
-// Based on Fermat's Little Theorem:
-//
-//	a^p = a (mod p)
-//	a^{p-1} = 1 (mod p)
-//	a^{p-2} = a^{-1} (mod p)
-func p256Invert(out, in *[p256Limbs]uint32) {
-	var ftmp, ftmp2 [p256Limbs]uint32
-
-	// each e_I will hold |in|^{2^I - 1}
-	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
-
-	p256Square(&ftmp, in)     // 2^1
-	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
-	p256Assign(&e2, &ftmp)
-	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
-	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
-	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
-	p256Assign(&e4, &ftmp)
-	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
-	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
-	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
-	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
-	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
-	p256Assign(&e8, &ftmp)
-	for i := 0; i < 8; i++ {
-		p256Square(&ftmp, &ftmp)
-	} // 2^16 - 2^8
-	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
-	p256Assign(&e16, &ftmp)
-	for i := 0; i < 16; i++ {
-		p256Square(&ftmp, &ftmp)
-	} // 2^32 - 2^16
-	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
-	p256Assign(&e32, &ftmp)
-	for i := 0; i < 32; i++ {
-		p256Square(&ftmp, &ftmp)
-	} // 2^64 - 2^32
-	p256Assign(&e64, &ftmp)
-	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
-	for i := 0; i < 192; i++ {
-		p256Square(&ftmp, &ftmp)
-	} // 2^256 - 2^224 + 2^192
-
-	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
-	for i := 0; i < 16; i++ {
-		p256Square(&ftmp2, &ftmp2)
-	} // 2^80 - 2^16
-	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
-	for i := 0; i < 8; i++ {
-		p256Square(&ftmp2, &ftmp2)
-	} // 2^88 - 2^8
-	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
-	for i := 0; i < 4; i++ {
-		p256Square(&ftmp2, &ftmp2)
-	} // 2^92 - 2^4
-	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
-	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
-	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
-	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
-	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
-	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
-	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
-
-	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
-}
-
-// p256Scalar3 sets out=3*out.
-//
-// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-func p256Scalar3(out *[p256Limbs]uint32) {
-	var carry uint32
-
-	for i := 0; ; i++ {
-		out[i] *= 3
-		out[i] += carry
-		carry = out[i] >> 29
-		out[i] &= bottom29Bits
-
-		i++
-		if i == p256Limbs {
-			break
-		}
-
-		out[i] *= 3
-		out[i] += carry
-		carry = out[i] >> 28
-		out[i] &= bottom28Bits
-	}
-
-	p256ReduceCarry(out, carry)
-}
-
-// p256Scalar4 sets out=4*out.
-//
-// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-func p256Scalar4(out *[p256Limbs]uint32) {
-	var carry, nextCarry uint32
-
-	for i := 0; ; i++ {
-		nextCarry = out[i] >> 27
-		out[i] <<= 2
-		out[i] &= bottom29Bits
-		out[i] += carry
-		carry = nextCarry + (out[i] >> 29)
-		out[i] &= bottom29Bits
-
-		i++
-		if i == p256Limbs {
-			break
-		}
-		nextCarry = out[i] >> 26
-		out[i] <<= 2
-		out[i] &= bottom28Bits
-		out[i] += carry
-		carry = nextCarry + (out[i] >> 28)
-		out[i] &= bottom28Bits
-	}
-
-	p256ReduceCarry(out, carry)
-}
-
-// p256Scalar8 sets out=8*out.
-//
-// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
-func p256Scalar8(out *[p256Limbs]uint32) {
-	var carry, nextCarry uint32
-
-	for i := 0; ; i++ {
-		nextCarry = out[i] >> 26
-		out[i] <<= 3
-		out[i] &= bottom29Bits
-		out[i] += carry
-		carry = nextCarry + (out[i] >> 29)
-		out[i] &= bottom29Bits
-
-		i++
-		if i == p256Limbs {
-			break
-		}
-		nextCarry = out[i] >> 25
-		out[i] <<= 3
-		out[i] &= bottom28Bits
-		out[i] += carry
-		carry = nextCarry + (out[i] >> 28)
-		out[i] &= bottom28Bits
-	}
-
-	p256ReduceCarry(out, carry)
-}
-
-// Group operations:
-//
-// Elements of the elliptic curve group are represented in Jacobian
-// coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
-// Jacobian form.
-
-// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
-//
-// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
-func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
-	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
-
-	p256Square(&delta, z)
-	p256Square(&gamma, y)
-	p256Mul(&beta, x, &gamma)
-
-	p256Sum(&tmp, x, &delta)
-	p256Diff(&tmp2, x, &delta)
-	p256Mul(&alpha, &tmp, &tmp2)
-	p256Scalar3(&alpha)
-
-	p256Sum(&tmp, y, z)
-	p256Square(&tmp, &tmp)
-	p256Diff(&tmp, &tmp, &gamma)
-	p256Diff(zOut, &tmp, &delta)
-
-	p256Scalar4(&beta)
-	p256Square(xOut, &alpha)
-	p256Diff(xOut, xOut, &beta)
-	p256Diff(xOut, xOut, &beta)
-
-	p256Diff(&tmp, &beta, xOut)
-	p256Mul(&tmp, &alpha, &tmp)
-	p256Square(&tmp2, &gamma)
-	p256Scalar8(&tmp2)
-	p256Diff(yOut, &tmp, &tmp2)
-}
-
-// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
-// (i.e. the second point is affine.)
-//
-// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
-//
-// Note that this function does not handle P+P, infinity+P nor P+infinity
-// correctly.
-func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
-	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
-
-	p256Square(&z1z1, z1)
-	p256Sum(&tmp, z1, z1)
-
-	p256Mul(&u2, x2, &z1z1)
-	p256Mul(&z1z1z1, z1, &z1z1)
-	p256Mul(&s2, y2, &z1z1z1)
-	p256Diff(&h, &u2, x1)
-	p256Sum(&i, &h, &h)
-	p256Square(&i, &i)
-	p256Mul(&j, &h, &i)
-	p256Diff(&r, &s2, y1)
-	p256Sum(&r, &r, &r)
-	p256Mul(&v, x1, &i)
-
-	p256Mul(zOut, &tmp, &h)
-	p256Square(&rr, &r)
-	p256Diff(xOut, &rr, &j)
-	p256Diff(xOut, xOut, &v)
-	p256Diff(xOut, xOut, &v)
-
-	p256Diff(&tmp, &v, xOut)
-	p256Mul(yOut, &tmp, &r)
-	p256Mul(&tmp, y1, &j)
-	p256Diff(yOut, yOut, &tmp)
-	p256Diff(yOut, yOut, &tmp)
-}
-
-// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
-//
-// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
-//
-// Note that this function does not handle P+P, infinity+P nor P+infinity
-// correctly.
-func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
-	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
-
-	p256Square(&z1z1, z1)
-	p256Square(&z2z2, z2)
-	p256Mul(&u1, x1, &z2z2)
-
-	p256Sum(&tmp, z1, z2)
-	p256Square(&tmp, &tmp)
-	p256Diff(&tmp, &tmp, &z1z1)
-	p256Diff(&tmp, &tmp, &z2z2)
-
-	p256Mul(&z2z2z2, z2, &z2z2)
-	p256Mul(&s1, y1, &z2z2z2)
-
-	p256Mul(&u2, x2, &z1z1)
-	p256Mul(&z1z1z1, z1, &z1z1)
-	p256Mul(&s2, y2, &z1z1z1)
-	p256Diff(&h, &u2, &u1)
-	p256Sum(&i, &h, &h)
-	p256Square(&i, &i)
-	p256Mul(&j, &h, &i)
-	p256Diff(&r, &s2, &s1)
-	p256Sum(&r, &r, &r)
-	p256Mul(&v, &u1, &i)
-
-	p256Mul(zOut, &tmp, &h)
-	p256Square(&rr, &r)
-	p256Diff(xOut, &rr, &j)
-	p256Diff(xOut, xOut, &v)
-	p256Diff(xOut, xOut, &v)
-
-	p256Diff(&tmp, &v, xOut)
-	p256Mul(yOut, &tmp, &r)
-	p256Mul(&tmp, &s1, &j)
-	p256Diff(yOut, yOut, &tmp)
-	p256Diff(yOut, yOut, &tmp)
-}
-
-// p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
-//
-// On entry: mask is either 0 or 0xffffffff.
-func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
-	for i := 0; i < p256Limbs; i++ {
-		tmp := mask & (in[i] ^ out[i])
-		out[i] ^= tmp
-	}
-}
-
-// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
-// On entry: index < 16, table[0] must be zero.
-func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
-	for i := range xOut {
-		xOut[i] = 0
-	}
-	for i := range yOut {
-		yOut[i] = 0
-	}
-
-	for i := uint32(1); i < 16; i++ {
-		mask := i ^ index
-		mask |= mask >> 2
-		mask |= mask >> 1
-		mask &= 1
-		mask--
-		for j := range xOut {
-			xOut[j] |= table[0] & mask
-			table = table[1:]
-		}
-		for j := range yOut {
-			yOut[j] |= table[0] & mask
-			table = table[1:]
-		}
-	}
-}
-
-// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
-// table.
-// On entry: index < 16, table[0] must be zero.
-func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
-	for i := range xOut {
-		xOut[i] = 0
-	}
-	for i := range yOut {
-		yOut[i] = 0
-	}
-	for i := range zOut {
-		zOut[i] = 0
-	}
-
-	// The implicit value at index 0 is all zero. We don't need to perform that
-	// iteration of the loop because we already set out_* to zero.
-	for i := uint32(1); i < 16; i++ {
-		mask := i ^ index
-		mask |= mask >> 2
-		mask |= mask >> 1
-		mask &= 1
-		mask--
-		for j := range xOut {
-			xOut[j] |= table[i][0][j] & mask
-		}
-		for j := range yOut {
-			yOut[j] |= table[i][1][j] & mask
-		}
-		for j := range zOut {
-			zOut[j] |= table[i][2][j] & mask
-		}
-	}
-}
-
-// p256GetBit returns the bit'th bit of scalar.
-func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
-	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
-}
-
-// p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
-// little-endian number. Note that the value of scalar must be less than the
-// order of the group.
-func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
-	nIsInfinityMask := ^uint32(0)
-	var pIsNoninfiniteMask, mask, tableOffset uint32
-	var px, py, tx, ty, tz [p256Limbs]uint32
-
-	for i := range xOut {
-		xOut[i] = 0
-	}
-	for i := range yOut {
-		yOut[i] = 0
-	}
-	for i := range zOut {
-		zOut[i] = 0
-	}
-
-	// The loop adds bits at positions 0, 64, 128 and 192, followed by
-	// positions 32,96,160 and 224 and does this 32 times.
-	for i := uint(0); i < 32; i++ {
-		if i != 0 {
-			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
-		}
-		tableOffset = 0
-		for j := uint(0); j <= 32; j += 32 {
-			bit0 := p256GetBit(scalar, 31-i+j)
-			bit1 := p256GetBit(scalar, 95-i+j)
-			bit2 := p256GetBit(scalar, 159-i+j)
-			bit3 := p256GetBit(scalar, 223-i+j)
-			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
-
-			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
-			tableOffset += 30 * p256Limbs
-
-			// Since scalar is less than the order of the group, we know that
-			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
-			// below.
-			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
-			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
-			// (a.k.a.  the point at infinity). We handle that situation by
-			// copying the point from the table.
-			p256CopyConditional(xOut, &px, nIsInfinityMask)
-			p256CopyConditional(yOut, &py, nIsInfinityMask)
-			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
-
-			// Equally, the result is also wrong if the point from the table is
-			// zero, which happens when the index is zero. We handle that by
-			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
-			pIsNoninfiniteMask = nonZeroToAllOnes(index)
-			mask = pIsNoninfiniteMask & ^nIsInfinityMask
-			p256CopyConditional(xOut, &tx, mask)
-			p256CopyConditional(yOut, &ty, mask)
-			p256CopyConditional(zOut, &tz, mask)
-			// If p was not zero, then n is now non-zero.
-			nIsInfinityMask &^= pIsNoninfiniteMask
-		}
-	}
-}
-
-// p256PointToAffine converts a Jacobian point to an affine point. If the input
-// is the point at infinity then it returns (0, 0) in constant time.
-func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
-	var zInv, zInvSq [p256Limbs]uint32
-
-	p256Invert(&zInv, z)
-	p256Square(&zInvSq, &zInv)
-	p256Mul(xOut, x, &zInvSq)
-	p256Mul(&zInv, &zInv, &zInvSq)
-	p256Mul(yOut, y, &zInv)
-}
-
-// p256ToAffine returns a pair of *big.Int containing the affine representation
-// of {x,y,z}.
-func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
-	var xx, yy [p256Limbs]uint32
-	p256PointToAffine(&xx, &yy, x, y, z)
-	return p256ToBig(&xx), p256ToBig(&yy)
-}
-
-// p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
-func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
-	var px, py, pz, tx, ty, tz [p256Limbs]uint32
-	var precomp [16][3][p256Limbs]uint32
-	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
-
-	// We precompute 0,1,2,... times {x,y}.
-	precomp[1][0] = *x
-	precomp[1][1] = *y
-	precomp[1][2] = p256One
-
-	for i := 2; i < 16; i += 2 {
-		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
-		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
-	}
-
-	for i := range xOut {
-		xOut[i] = 0
-	}
-	for i := range yOut {
-		yOut[i] = 0
-	}
-	for i := range zOut {
-		zOut[i] = 0
-	}
-	nIsInfinityMask = ^uint32(0)
-
-	// We add in a window of four bits each iteration and do this 64 times.
-	for i := 0; i < 64; i++ {
-		if i != 0 {
-			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
-			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
-			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
-			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
-		}
-
-		index = uint32(scalar[31-i/2])
-		if (i & 1) == 1 {
-			index &= 15
-		} else {
-			index >>= 4
-		}
-
-		// See the comments in scalarBaseMult about handling infinities.
-		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
-		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
-		p256CopyConditional(xOut, &px, nIsInfinityMask)
-		p256CopyConditional(yOut, &py, nIsInfinityMask)
-		p256CopyConditional(zOut, &pz, nIsInfinityMask)
-
-		pIsNoninfiniteMask = nonZeroToAllOnes(index)
-		mask = pIsNoninfiniteMask & ^nIsInfinityMask
-		p256CopyConditional(xOut, &tx, mask)
-		p256CopyConditional(yOut, &ty, mask)
-		p256CopyConditional(zOut, &tz, mask)
-		nIsInfinityMask &^= pIsNoninfiniteMask
-	}
-}
-
-// p256FromBig sets out = R*in.
-func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
-	tmp := new(big.Int).Lsh(in, 257)
-	tmp.Mod(tmp, p256Params.P)
-
-	for i := 0; i < p256Limbs; i++ {
-		if bits := tmp.Bits(); len(bits) > 0 {
-			out[i] = uint32(bits[0]) & bottom29Bits
-		} else {
-			out[i] = 0
-		}
-		tmp.Rsh(tmp, 29)
-
-		i++
-		if i == p256Limbs {
-			break
-		}
-
-		if bits := tmp.Bits(); len(bits) > 0 {
-			out[i] = uint32(bits[0]) & bottom28Bits
-		} else {
-			out[i] = 0
-		}
-		tmp.Rsh(tmp, 28)
-	}
-}
-
-// p256ToBig returns a *big.Int containing the value of in.
-func p256ToBig(in *[p256Limbs]uint32) *big.Int {
-	result, tmp := new(big.Int), new(big.Int)
-
-	result.SetInt64(int64(in[p256Limbs-1]))
-	for i := p256Limbs - 2; i >= 0; i-- {
-		if (i & 1) == 0 {
-			result.Lsh(result, 29)
-		} else {
-			result.Lsh(result, 28)
-		}
-		tmp.SetInt64(int64(in[i]))
-		result.Add(result, tmp)
-	}
-
-	result.Mul(result, p256RInverse)
-	result.Mod(result, p256Params.P)
-	return result
-}
diff --git a/src/crypto/elliptic/p256_asm.go b/src/crypto/elliptic/p256_asm.go
index 93adaf9..ce80282 100644
--- a/src/crypto/elliptic/p256_asm.go
+++ b/src/crypto/elliptic/p256_asm.go
@@ -24,27 +24,18 @@
 //go:embed p256_asm_table.bin
 var p256Precomputed string
 
-type (
-	p256Curve struct {
-		*CurveParams
-	}
+type p256Curve struct {
+	*CurveParams
+}
 
-	p256Point struct {
-		xyz [12]uint64
-	}
-)
+type p256Point struct {
+	xyz [12]uint64
+}
 
 var p256 p256Curve
 
-func initP256() {
-	// See FIPS 186-3, section D.2.3
-	p256.CurveParams = &CurveParams{Name: "P-256"}
-	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
-	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
-	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
-	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
-	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
-	p256.BitSize = 256
+func initP256Arch() {
+	p256 = p256Curve{p256Params}
 }
 
 func (curve p256Curve) Params() *CurveParams {
diff --git a/src/crypto/elliptic/p256_generic.go b/src/crypto/elliptic/p256_generic.go
index 7f8fab5..22dde23 100644
--- a/src/crypto/elliptic/p256_generic.go
+++ b/src/crypto/elliptic/p256_generic.go
@@ -1,14 +1,477 @@
-// Copyright 2016 The Go Authors. All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 && !s390x && !arm64 && !ppc64le
+//go:build !amd64 && !arm64
 
 package elliptic
 
-var p256 p256Curve
+// This file contains a constant-time, 32-bit implementation of P256.
 
-func initP256Arch() {
-	// Use pure Go implementation.
-	p256 = p256Curve{p256Params}
+import "math/big"
+
+type p256Curve struct {
+	*CurveParams
+}
+
+func (curve p256Curve) Params() *CurveParams {
+	return curve.CurveParams
+}
+
+// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
+// to out. If the scalar is equal to or greater than the order of the group,
+// it's reduced modulo that order.
+func p256GetScalar(out *[32]byte, in []byte) {
+	n := new(big.Int).SetBytes(in)
+	var scalarBytes []byte
+
+	if n.Cmp(p256Params.N) >= 0 || len(in) > len(out) {
+		n.Mod(n, p256Params.N)
+		scalarBytes = n.Bytes()
+	} else {
+		scalarBytes = in
+	}
+
+	for i, v := range scalarBytes {
+		out[len(scalarBytes)-(1+i)] = v
+	}
+}
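+
+// For example (an illustrative note, not part of the original change), the
+// 32-byte big-endian encoding of 1 (31 zero bytes, then 0x01) is below the
+// group order, so it is copied as-is and comes back with out[0] == 1: out
+// holds the little-endian form that p256ScalarBaseMult and p256ScalarMult
+// consume.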
+
+func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
+	var scalarReversed [32]byte
+	p256GetScalar(&scalarReversed, scalar)
+
+	var x1, y1, z1 [p256Limbs]uint32
+	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
+	return p256ToAffine(&x1, &y1, &z1)
+}
+
+func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
+	var scalarReversed [32]byte
+	p256GetScalar(&scalarReversed, scalar)
+
+	var px, py, x1, y1, z1 [p256Limbs]uint32
+	p256FromBig(&px, bigX)
+	p256FromBig(&py, bigY)
+	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
+	return p256ToAffine(&x1, &y1, &z1)
+}
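+
+// As a sanity check, the two entry points above must agree for any scalar k.
+// A hypothetical in-package test (a sketch, not part of the original
+// change):
+//
+//	func checkBaseMult(k []byte) bool {
+//		c := p256Curve{p256Params}
+//		x1, y1 := c.ScalarBaseMult(k)
+//		x2, y2 := c.ScalarMult(c.Gx, c.Gy, k)
+//		return x1.Cmp(x2) == 0 && y1.Cmp(y2) == 0
+//	}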
+
+// p256Precomputed contains precomputed values to aid the calculation of scalar
+// multiples of the base point, G. It's actually two equal-length tables
+// concatenated.
+//
+// The first table contains (x,y) field element pairs for 16 multiples of the
+// base point, G.
+//
+//	Index  |  Index (binary) | Value
+//	    0  |           0000  | 0G (all zeros, omitted)
+//	    1  |           0001  | G
+//	    2  |           0010  | 2**64G
+//	    3  |           0011  | 2**64G + G
+//	    4  |           0100  | 2**128G
+//	    5  |           0101  | 2**128G + G
+//	    6  |           0110  | 2**128G + 2**64G
+//	    7  |           0111  | 2**128G + 2**64G + G
+//	    8  |           1000  | 2**192G
+//	    9  |           1001  | 2**192G + G
+//	   10  |           1010  | 2**192G + 2**64G
+//	   11  |           1011  | 2**192G + 2**64G + G
+//	   12  |           1100  | 2**192G + 2**128G
+//	   13  |           1101  | 2**192G + 2**128G + G
+//	   14  |           1110  | 2**192G + 2**128G + 2**64G
+//	   15  |           1111  | 2**192G + 2**128G + 2**64G + G
+//
+// The second table follows the same style, but the terms are 2**32G,
+// 2**96G, 2**160G, 2**224G.
+//
+// This is ~2KB of data.
+var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
+	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
+	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
+	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
+	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
+	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
+	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
+	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
+	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
+	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
+	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
+	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
+	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
+	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
+	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
+	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
+	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
+	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
+	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
+	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
+	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
+	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
+	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
+	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
+	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
+	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
+	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
+	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
+	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
+	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
+	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
+	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
+	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
+	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
+	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
+	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
+	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
+	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
+	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
+	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
+	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
+	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
+	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
+	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
+	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
+	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
+	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
+	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
+	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
+	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
+	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
+	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
+	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
+	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
+	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
+	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
+	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
+	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
+	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
+	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
+	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
+}
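+
+// p256AffineEntry is an illustrative helper (a sketch, not part of the
+// original change) that makes the layout above concrete: each of the two
+// tables holds 15 entries of 2*p256Limbs words (x limbs, then y limbs), so
+// entry i of table t starts at word (t*15 + (i-1)) * 2 * p256Limbs. The
+// constant-time lookup below, p256SelectAffinePoint, never indexes the table
+// like this directly.
+func p256AffineEntry(t, i int) (x, y []uint32) {
+	// i must be in [1,15]; index 0 (the point at infinity) is omitted from
+	// the table.
+	base := (t*15 + (i - 1)) * 2 * p256Limbs
+	return p256Precomputed[base : base+p256Limbs],
+		p256Precomputed[base+p256Limbs : base+2*p256Limbs]
+}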
+
+// Group operations:
+//
+// Elements of the elliptic curve group are represented in Jacobian
+// coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
+// Jacobian form.
+
+// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
+//
+// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
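+//
+// In the notation of that page: delta = Z1^2, gamma = Y1^2, beta = X1*gamma,
+// alpha = 3*(X1 - delta)*(X1 + delta), X3 = alpha^2 - 8*beta,
+// Z3 = (Y1 + Z1)^2 - gamma - delta, Y3 = alpha*(4*beta - X3) - 8*gamma^2.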
+func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
+	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
+
+	p256Square(&delta, z)
+	p256Square(&gamma, y)
+	p256Mul(&beta, x, &gamma)
+
+	p256Sum(&tmp, x, &delta)
+	p256Diff(&tmp2, x, &delta)
+	p256Mul(&alpha, &tmp, &tmp2)
+	p256Scalar3(&alpha)
+
+	p256Sum(&tmp, y, z)
+	p256Square(&tmp, &tmp)
+	p256Diff(&tmp, &tmp, &gamma)
+	p256Diff(zOut, &tmp, &delta)
+
+	p256Scalar4(&beta)
+	p256Square(xOut, &alpha)
+	p256Diff(xOut, xOut, &beta)
+	p256Diff(xOut, xOut, &beta)
+
+	p256Diff(&tmp, &beta, xOut)
+	p256Mul(&tmp, &alpha, &tmp)
+	p256Square(&tmp2, &gamma)
+	p256Scalar8(&tmp2)
+	p256Diff(yOut, &tmp, &tmp2)
+}
+
+// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}
+// (i.e., the second point is affine).
+//
+// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
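+//
+// With Z2 = 1 the formulas on that page specialize to: U2 = X2*Z1^2,
+// S2 = Y2*Z1^3, H = U2 - X1, I = (2*H)^2, J = H*I, r = 2*(S2 - Y1),
+// V = X1*I, X3 = r^2 - J - 2*V, Y3 = r*(V - X3) - 2*Y1*J, Z3 = 2*Z1*H.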
+//
+// Note that this function does not handle P+P, infinity+P nor P+infinity
+// correctly.
+func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
+	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
+
+	p256Square(&z1z1, z1)
+	p256Sum(&tmp, z1, z1)
+
+	p256Mul(&u2, x2, &z1z1)
+	p256Mul(&z1z1z1, z1, &z1z1)
+	p256Mul(&s2, y2, &z1z1z1)
+	p256Diff(&h, &u2, x1)
+	p256Sum(&i, &h, &h)
+	p256Square(&i, &i)
+	p256Mul(&j, &h, &i)
+	p256Diff(&r, &s2, y1)
+	p256Sum(&r, &r, &r)
+	p256Mul(&v, x1, &i)
+
+	p256Mul(zOut, &tmp, &h)
+	p256Square(&rr, &r)
+	p256Diff(xOut, &rr, &j)
+	p256Diff(xOut, xOut, &v)
+	p256Diff(xOut, xOut, &v)
+
+	p256Diff(&tmp, &v, xOut)
+	p256Mul(yOut, &tmp, &r)
+	p256Mul(&tmp, y1, &j)
+	p256Diff(yOut, yOut, &tmp)
+	p256Diff(yOut, yOut, &tmp)
+}
+
+// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
+//
+// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
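+//
+// In the notation of that page: Z1Z1 = Z1^2, Z2Z2 = Z2^2, U1 = X1*Z2Z2,
+// U2 = X2*Z1Z1, S1 = Y1*Z2*Z2Z2, S2 = Y2*Z1*Z1Z1, H = U2 - U1,
+// I = (2*H)^2, J = H*I, r = 2*(S2 - S1), V = U1*I, X3 = r^2 - J - 2*V,
+// Y3 = r*(V - X3) - 2*S1*J, Z3 = ((Z1 + Z2)^2 - Z1Z1 - Z2Z2)*H.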
+//
+// Note that this function does not handle P+P, infinity+P nor P+infinity
+// correctly.
+func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
+	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
+
+	p256Square(&z1z1, z1)
+	p256Square(&z2z2, z2)
+	p256Mul(&u1, x1, &z2z2)
+
+	p256Sum(&tmp, z1, z2)
+	p256Square(&tmp, &tmp)
+	p256Diff(&tmp, &tmp, &z1z1)
+	p256Diff(&tmp, &tmp, &z2z2)
+
+	p256Mul(&z2z2z2, z2, &z2z2)
+	p256Mul(&s1, y1, &z2z2z2)
+
+	p256Mul(&u2, x2, &z1z1)
+	p256Mul(&z1z1z1, z1, &z1z1)
+	p256Mul(&s2, y2, &z1z1z1)
+	p256Diff(&h, &u2, &u1)
+	p256Sum(&i, &h, &h)
+	p256Square(&i, &i)
+	p256Mul(&j, &h, &i)
+	p256Diff(&r, &s2, &s1)
+	p256Sum(&r, &r, &r)
+	p256Mul(&v, &u1, &i)
+
+	p256Mul(zOut, &tmp, &h)
+	p256Square(&rr, &r)
+	p256Diff(xOut, &rr, &j)
+	p256Diff(xOut, xOut, &v)
+	p256Diff(xOut, xOut, &v)
+
+	p256Diff(&tmp, &v, xOut)
+	p256Mul(yOut, &tmp, &r)
+	p256Mul(&tmp, &s1, &j)
+	p256Diff(yOut, yOut, &tmp)
+	p256Diff(yOut, yOut, &tmp)
+}
+
+// p256SelectAffinePoint sets {xOut,yOut} to the index'th entry of table.
+//
+// On entry: index < 16, table[0] must be zero.
+func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
+	for i := range xOut {
+		xOut[i] = 0
+	}
+	for i := range yOut {
+		yOut[i] = 0
+	}
+
+	for i := uint32(1); i < 16; i++ {
+		mask := i ^ index
+		mask |= mask >> 2
+		mask |= mask >> 1
+		mask &= 1
+		mask--
+		for j := range xOut {
+			xOut[j] |= table[0] & mask
+			table = table[1:]
+		}
+		for j := range yOut {
+			yOut[j] |= table[0] & mask
+			table = table[1:]
+		}
+	}
+}
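+
+// The mask computation in the selection loops above and below is a
+// constant-time equality test on 4-bit values. A minimal standalone sketch
+// (an illustration, not part of the original change):
+//
+//	func eqMask4(a, b uint32) uint32 { // a, b < 16
+//		m := a ^ b // zero only when a == b
+//		m |= m >> 2
+//		m |= m >> 1 // fold any set bit into bit 0
+//		return (m & 1) - 1 // 0xffffffff when a == b, 0 otherwise
+//	}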
+
+// p256SelectJacobianPoint sets {xOut,yOut,zOut} to the index'th entry of
+// table.
+//
+// On entry: index < 16, table[0] must be zero.
+func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
+	for i := range xOut {
+		xOut[i] = 0
+	}
+	for i := range yOut {
+		yOut[i] = 0
+	}
+	for i := range zOut {
+		zOut[i] = 0
+	}
+
+	// The implicit value at index 0 is all zero. We don't need to perform that
+	// iteration of the loop because we already zeroed the outputs.
+	for i := uint32(1); i < 16; i++ {
+		mask := i ^ index
+		mask |= mask >> 2
+		mask |= mask >> 1
+		mask &= 1
+		mask--
+		for j := range xOut {
+			xOut[j] |= table[i][0][j] & mask
+		}
+		for j := range yOut {
+			yOut[j] |= table[i][1][j] & mask
+		}
+		for j := range zOut {
+			zOut[j] |= table[i][2][j] & mask
+		}
+	}
+}
+
+// p256GetBit returns the bit'th bit of scalar.
+func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
+	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
+}
+
+// p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
+// little-endian number. Note that the value of scalar must be less than the
+// order of the group.
+func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
+	nIsInfinityMask := ^uint32(0)
+	var pIsNoninfiniteMask, mask, tableOffset uint32
+	var px, py, tx, ty, tz [p256Limbs]uint32
+
+	for i := range xOut {
+		xOut[i] = 0
+	}
+	for i := range yOut {
+		yOut[i] = 0
+	}
+	for i := range zOut {
+		zOut[i] = 0
+	}
+
+	// The loop adds bits at positions 0, 64, 128 and 192, followed by
+	// positions 32, 96, 160 and 224, and does this 32 times.
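+	// For example, iteration i=0 reads bits 31, 95, 159 and 223 (j=0) and
+	// then bits 63, 127, 191 and 255 (j=32); the final iteration, i=31,
+	// reads bits 0, 64, 128 and 192 followed by bits 32, 96, 160 and 224.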
+	for i := uint(0); i < 32; i++ {
+		if i != 0 {
+			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
+		}
+		tableOffset = 0
+		for j := uint(0); j <= 32; j += 32 {
+			bit0 := p256GetBit(scalar, 31-i+j)
+			bit1 := p256GetBit(scalar, 95-i+j)
+			bit2 := p256GetBit(scalar, 159-i+j)
+			bit3 := p256GetBit(scalar, 223-i+j)
+			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
+
+			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
+			tableOffset += 30 * p256Limbs
+
+			// Since scalar is less than the order of the group, we know that
+			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
+			// below.
+			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
+			// The result of p256PointAddMixed is incorrect if {xOut,yOut,zOut} is
+			// zero (a.k.a. the point at infinity). We handle that situation by
+			// copying the point from the table.
+			p256CopyConditional(xOut, &px, nIsInfinityMask)
+			p256CopyConditional(yOut, &py, nIsInfinityMask)
+			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
+
+			// Equally, the result is also wrong if the point from the table is
+			// zero, which happens when the index is zero. We handle that by
+			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
+			pIsNoninfiniteMask = nonZeroToAllOnes(index)
+			mask = pIsNoninfiniteMask & ^nIsInfinityMask
+			p256CopyConditional(xOut, &tx, mask)
+			p256CopyConditional(yOut, &ty, mask)
+			p256CopyConditional(zOut, &tz, mask)
+			// If p was not zero, then n is now non-zero.
+			nIsInfinityMask &^= pIsNoninfiniteMask
+		}
+	}
+}
+
+// p256PointToAffine converts a Jacobian point to an affine point. If the input
+// is the point at infinity then it returns (0, 0) in constant time.
+func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
+	var zInv, zInvSq [p256Limbs]uint32
+
+	p256Invert(&zInv, z)
+	p256Square(&zInvSq, &zInv)
+	p256Mul(xOut, x, &zInvSq)
+	p256Mul(&zInv, &zInv, &zInvSq)
+	p256Mul(yOut, y, &zInv)
+}
+
+// p256ToAffine returns a pair of *big.Int containing the affine representation
+// of {x,y,z}.
+func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
+	var xx, yy [p256Limbs]uint32
+	p256PointToAffine(&xx, &yy, x, y, z)
+	return p256ToBig(&xx), p256ToBig(&yy)
+}
+
+// p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
+func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
+	var px, py, pz, tx, ty, tz [p256Limbs]uint32
+	var precomp [16][3][p256Limbs]uint32
+	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
+
+	// We precompute 0,1,2,... times {x,y}.
+	precomp[1][0] = *x
+	precomp[1][1] = *y
+	precomp[1][2] = p256One
+
+	for i := 2; i < 16; i += 2 {
+		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
+		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
+	}
+
+	for i := range xOut {
+		xOut[i] = 0
+	}
+	for i := range yOut {
+		yOut[i] = 0
+	}
+	for i := range zOut {
+		zOut[i] = 0
+	}
+	nIsInfinityMask = ^uint32(0)
+
+	// We add in a window of four bits each iteration and do this 64 times.
+	for i := 0; i < 64; i++ {
+		if i != 0 {
+			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
+			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
+			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
+			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
+		}
+
+		index = uint32(scalar[31-i/2])
+		if (i & 1) == 1 {
+			index &= 15
+		} else {
+			index >>= 4
+		}
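+		// Iteration i therefore reads scalar[31-i/2]: the high nibble of
+		// the most significant byte of the little-endian scalar first,
+		// then its low nibble, and so on down the scalar.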
+
+		// See the comments in p256ScalarBaseMult about handling infinities.
+		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
+		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
+		p256CopyConditional(xOut, &px, nIsInfinityMask)
+		p256CopyConditional(yOut, &py, nIsInfinityMask)
+		p256CopyConditional(zOut, &pz, nIsInfinityMask)
+
+		pIsNoninfiniteMask = nonZeroToAllOnes(index)
+		mask = pIsNoninfiniteMask & ^nIsInfinityMask
+		p256CopyConditional(xOut, &tx, mask)
+		p256CopyConditional(yOut, &ty, mask)
+		p256CopyConditional(zOut, &tz, mask)
+		nIsInfinityMask &^= pIsNoninfiniteMask
+	}
 }
diff --git a/src/crypto/elliptic/p256_generic_field.go b/src/crypto/elliptic/p256_generic_field.go
new file mode 100644
index 0000000..5824946
--- /dev/null
+++ b/src/crypto/elliptic/p256_generic_field.go
@@ -0,0 +1,705 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !arm64
+
+package elliptic
+
+import "math/big"
+
+// Field elements are represented as nine unsigned 32-bit words.
+//
+// The value of a field element is:
+//   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
+//
+// That is, each limb is alternately 29 or 28 bits wide, in little-endian
+// order.
+//
+// This means that a field element hits 2**257, rather than 2**256 as we would
+// like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
+// problems when multiplying, as terms end up one bit short of a limb, which
+// would require much bit-shifting to correct.
+//
+// Finally, the values stored in a field element are in Montgomery form. So the
+// value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
+// 2**257.
+
+const (
+	p256Limbs    = 9
+	bottom29Bits = 0x1fffffff
+)
+
+var (
+	// p256One is the number 1 as a field element.
+	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
+	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
+	// p256P is the prime modulus as a field element.
+	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
+	// p2562P is twice the prime modulus as a field element.
+	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
+)
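+
+// p256LimbsValue is an illustrative sketch (not part of the original change)
+// that evaluates the representation formula above, returning
+// x[0] + x[1]*2**29 + x[2]*2**57 + ... as a big.Int. Note that it does not
+// strip the Montgomery factor R.
+func p256LimbsValue(x *[p256Limbs]uint32) *big.Int {
+	v := new(big.Int)
+	shift := uint(0)
+	for i := 0; i < p256Limbs; i++ {
+		v.Add(v, new(big.Int).Lsh(big.NewInt(int64(x[i])), shift))
+		// Limb widths alternate 29, 28, 29, 28, ...
+		if i%2 == 0 {
+			shift += 29
+		} else {
+			shift += 28
+		}
+	}
+	return v
+}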
+
+// Field element operations:
+
+const bottom28Bits = 0xfffffff
+
+// nonZeroToAllOnes returns:
+//
+//	0xffffffff for 0 < x <= 2**31
+//	0 for x == 0 or x > 2**31.
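+//
+// For example, x == 0 gives ((0-1)>>31)-1 = 1-1 = 0, while x == 5 gives
+// (4>>31)-1 = 0-1 = 0xffffffff; for x > 2**31, bit 31 of x-1 is set, so the
+// shift yields 1 and the result is again 0.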
+func nonZeroToAllOnes(x uint32) uint32 {
+	return ((x - 1) >> 31) - 1
+}
+
+// p256ReduceCarry adds a multiple of p in order to cancel |carry|,
+// which is a term at 2**257.
+//
+// On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
+// On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
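+//
+// The shifted carry terms implement carry * (2**257 mod p): since
+// p = 2**256 - 2**224 + 2**192 + 2**96 - 1, we have
+// 2**257 = 2 - 2**97 - 2**193 + 2**225 (mod p). These bits land at limb
+// start positions 0 (inout[0]), 86 (inout[3]), 171 (inout[6]) and 200
+// (inout[7]). The masked constants sum to zero and merely keep the
+// intermediate limbs from going negative.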
+func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
+	carryMask := nonZeroToAllOnes(carry)
+
+	inout[0] += carry << 1
+	inout[3] += 0x10000000 & carryMask
+	// carry < 2**3, thus (carry << 11) < 2**14, and we added 2**28 in the
+	// previous line, so this doesn't underflow.
+	inout[3] -= carry << 11
+	inout[4] += (0x20000000 - 1) & carryMask
+	inout[5] += (0x10000000 - 1) & carryMask
+	inout[6] += (0x20000000 - 1) & carryMask
+	inout[6] -= carry << 22
+	// This may underflow if carry is non-zero but, if so, we'll fix it in the
+	// next line.
+	inout[7] -= 1 & carryMask
+	inout[7] += carry << 25
+}
+
+// p256Sum sets out = in+in2.
+//
+// On entry: in[i]+in2[i] must not overflow a 32-bit word.
+// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+func p256Sum(out, in, in2 *[p256Limbs]uint32) {
+	carry := uint32(0)
+	for i := 0; ; i++ {
+		out[i] = in[i] + in2[i]
+		out[i] += carry
+		carry = out[i] >> 29
+		out[i] &= bottom29Bits
+
+		i++
+		if i == p256Limbs {
+			break
+		}
+
+		out[i] = in[i] + in2[i]
+		out[i] += carry
+		carry = out[i] >> 28
+		out[i] &= bottom28Bits
+	}
+
+	p256ReduceCarry(out, carry)
+}
+
+const (
+	two30m2    = 1<<30 - 1<<2
+	two30p13m2 = 1<<30 + 1<<13 - 1<<2
+	two31m2    = 1<<31 - 1<<2
+	two31m3    = 1<<31 - 1<<3
+	two31p24m2 = 1<<31 + 1<<24 - 1<<2
+	two30m27m2 = 1<<30 - 1<<27 - 1<<2
+)
+
+// p256Zero31 is 0 mod p.
+var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
+
+// p256Diff sets out = in-in2.
+//
+// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
+// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
+// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
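+//
+// p256Zero31 is a representation of zero whose limbs are large enough to
+// cover the entry bounds on in2, so every limb of in - in2 + p256Zero31
+// stays non-negative while the value mod p is unchanged.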
+func p256Diff(out, in, in2 *[p256Limbs]uint32) {
+	var carry uint32
+
+	for i := 0; ; i++ {
+		out[i] = in[i] - in2[i]
+		out[i] += p256Zero31[i]
+		out[i] += carry
+		carry = out[i] >> 29
+		out[i] &= bottom29Bits
+
+		i++
+		if i == p256Limbs {
+			break
+		}
+
+		out[i] = in[i] - in2[i]
+		out[i] += p256Zero31[i]
+		out[i] += carry
+		carry = out[i] >> 28
+		out[i] &= bottom28Bits
+	}
+
+	p256ReduceCarry(out, carry)
+}
+
+// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
+// the same 29,28,... bit positions as a field element.
+//
+// The values in field elements are in Montgomery form: x*R mod p where R =
+// 2**257. Since we just multiplied two Montgomery values together, the result
+// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
+// in Montgomery form.
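+// That is, (x*R)*(y*R) = (x*y*R)*R, so a single division by R leaves
+// x*y*R: the Montgomery representation of the product.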
+//
+// On entry: tmp[i] < 2**64.
+// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
+	// The following table may be helpful when reading this code:
+	//
+	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
+	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
+	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
+	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
+	var tmp2 [18]uint32
+	var carry, x, xMask uint32
+
+	// tmp contains 64-bit words with the same 29,28,29-bit positions as a
+	// field element. So the top of an element of tmp might overlap with
+	// another element two positions down. The following loop eliminates
+	// this overlap.
+	tmp2[0] = uint32(tmp[0]) & bottom29Bits
+
+	tmp2[1] = uint32(tmp[0]) >> 29
+	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
+	tmp2[1] += uint32(tmp[1]) & bottom28Bits
+	carry = tmp2[1] >> 28
+	tmp2[1] &= bottom28Bits
+
+	for i := 2; i < 17; i++ {
+		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
+		tmp2[i] += (uint32(tmp[i-1])) >> 28
+		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
+		tmp2[i] += uint32(tmp[i]) & bottom29Bits
+		tmp2[i] += carry
+		carry = tmp2[i] >> 29
+		tmp2[i] &= bottom29Bits
+
+		i++
+		if i == 17 {
+			break
+		}
+		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
+		tmp2[i] += uint32(tmp[i-1]) >> 29
+		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
+		tmp2[i] += uint32(tmp[i]) & bottom28Bits
+		tmp2[i] += carry
+		carry = tmp2[i] >> 28
+		tmp2[i] &= bottom28Bits
+	}
+
+	tmp2[17] = uint32(tmp[15]>>32) >> 25
+	tmp2[17] += uint32(tmp[16]) >> 29
+	tmp2[17] += uint32(tmp[16]>>32) << 3
+	tmp2[17] += carry
+
+	// Montgomery elimination of terms:
+	//
+	// Since R is 2**257, we can divide by R with a bitwise shift if we can
+	// ensure that the right-most 257 bits are all zero. We can make that true
+	// by adding multiplies of p without affecting the value.
+	//
+	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
+	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
+	// We can do that for 8 further limbs and then right shift to eliminate the
+	// extra factor of R.
+	for i := 0; ; i += 2 {
+		tmp2[i+1] += tmp2[i] >> 29
+		x = tmp2[i] & bottom29Bits
+		xMask = nonZeroToAllOnes(x)
+		tmp2[i] = 0
+
+		// The bounds calculations for this loop are tricky. Each iteration of
+		// the loop eliminates two words by adding values to words to their
+		// right.
+		//
+		// The following table contains the amounts added to each word (as an
+		// offset from the value of i at the top of the loop). The amounts are
+		// accounted for from the first and second half of the loop separately
+		// and are written as, for example, 28 to mean a value <2**28.
+		//
+		// Word:                   3   4   5   6   7   8   9   10
+		// Added in top half:     28  11      29  21  29  28
+		//                                        28  29
+		//                                            29
+		// Added in bottom half:      29  10      28  21  28   28
+		//                                            29
+		//
+		// The value that is currently offset 7 will be offset 5 for the next
+		// iteration and then offset 3 for the iteration after that. Therefore
+		// the total value added will be the values added at 7, 5 and 3.
+		//
+		// The following table accumulates these values. The sums at the bottom
+		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
+		//
+		// Word:                   3   4   5   6   7   8   9  10  11  12  13
+		//                        28  11  10  29  21  29  28  28  28  28  28
+		//                            29  28  11  28  29  28  29  28  29  28
+		//                                    29  28  21  21  29  21  29  21
+		//                                        10  29  28  21  28  21  28
+		//                                        28  29  28  29  28  29  28
+		//                                            11  10  29  10  29  10
+		//                                            29  28  11  28  11
+		//                                                    29      29
+		//                        --------------------------------------------
+		//                                                30+ 31+ 30+ 31+ 30+
+		//                                                28+ 29+ 28+ 29+ 21+
+		//                                                21+ 28+ 21+ 28+ 10
+		//                                                10  21+ 10  21+
+		//                                                    11      11
+		//
+		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
+		// tmp2[10/12] has an initial value of <2**29, then the maximum value
+		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
+		// as required.
+		tmp2[i+3] += (x << 10) & bottom28Bits
+		tmp2[i+4] += (x >> 18)
+
+		tmp2[i+6] += (x << 21) & bottom29Bits
+		tmp2[i+7] += x >> 8
+
+		// At position 200, which is the starting bit position for word 7, we
+		// have a factor of 0xf000000 = 2**28 - 2**24.
+		tmp2[i+7] += 0x10000000 & xMask
+		tmp2[i+8] += (x - 1) & xMask
+		tmp2[i+7] -= (x << 24) & bottom28Bits
+		tmp2[i+8] -= x >> 4
+
+		tmp2[i+8] += 0x20000000 & xMask
+		tmp2[i+8] -= x
+		tmp2[i+8] += (x << 28) & bottom29Bits
+		tmp2[i+9] += ((x >> 1) - 1) & xMask
+
+		if i+1 == p256Limbs {
+			break
+		}
+		tmp2[i+2] += tmp2[i+1] >> 28
+		x = tmp2[i+1] & bottom28Bits
+		xMask = nonZeroToAllOnes(x)
+		tmp2[i+1] = 0
+
+		tmp2[i+4] += (x << 11) & bottom29Bits
+		tmp2[i+5] += (x >> 18)
+
+		tmp2[i+7] += (x << 21) & bottom28Bits
+		tmp2[i+8] += x >> 7
+
+		// At position 199, which is the starting bit of the 8th word when
+		// dealing with a context starting on an odd word, we have a factor of
+		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
+		// word from i+1 is i+8.
+		tmp2[i+8] += 0x20000000 & xMask
+		tmp2[i+9] += (x - 1) & xMask
+		tmp2[i+8] -= (x << 25) & bottom29Bits
+		tmp2[i+9] -= x >> 4
+
+		tmp2[i+9] += 0x10000000 & xMask
+		tmp2[i+9] -= x
+		tmp2[i+10] += (x - 1) & xMask
+	}
+
+	// We merge the right shift with a carry chain. The words above 2**257 have
+	// widths of 28,29,... which we need to correct when copying them down.
+	carry = 0
+	for i := 0; i < 8; i++ {
+		// The maximum value of tmp2[i + 9] occurs on the first iteration and
+		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
+		// therefore safe.
+		out[i] = tmp2[i+9]
+		out[i] += carry
+		out[i] += (tmp2[i+10] << 28) & bottom29Bits
+		carry = out[i] >> 29
+		out[i] &= bottom29Bits
+
+		i++
+		out[i] = tmp2[i+9] >> 1
+		out[i] += carry
+		carry = out[i] >> 28
+		out[i] &= bottom28Bits
+	}
+
+	out[8] = tmp2[17]
+	out[8] += carry
+	carry = out[8] >> 29
+	out[8] &= bottom29Bits
+
+	p256ReduceCarry(out, carry)
+}
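For intuition, the same invariant expressed with big.Int, assuming only R = 2**257 and the P-256 prime: multiplying two Montgomery residues and then dividing by R yields the Montgomery residue of the product (a sketch, not part of the patch):

	package main

	import (
		"fmt"
		"math/big"
	)

	func main() {
		p, _ := new(big.Int).SetString("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)
		R := new(big.Int).Lsh(big.NewInt(1), 257)
		Rinv := new(big.Int).ModInverse(R, p)

		x, y := big.NewInt(12345), big.NewInt(67890)
		xR := new(big.Int).Mod(new(big.Int).Mul(x, R), p) // x in Montgomery form
		yR := new(big.Int).Mod(new(big.Int).Mul(y, R), p)

		// Multiply, then divide by R: (x*R)*(y*R)/R = x*y*R mod p.
		got := new(big.Int).Mul(xR, yR)
		got.Mul(got, Rinv).Mod(got, p)

		want := new(big.Int).Mod(new(big.Int).Mul(new(big.Int).Mul(x, y), R), p)
		fmt.Println(got.Cmp(want) == 0) // true
	}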
+
+// p256Square sets out=in*in.
+//
+// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
+// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+func p256Square(out, in *[p256Limbs]uint32) {
+	var tmp [17]uint64
+
+	tmp[0] = uint64(in[0]) * uint64(in[0])
+	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
+	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
+		uint64(in[1])*(uint64(in[1])<<1)
+	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
+		uint64(in[1])*(uint64(in[2])<<1)
+	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
+		uint64(in[1])*(uint64(in[3])<<2) +
+		uint64(in[2])*uint64(in[2])
+	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
+		uint64(in[1])*(uint64(in[4])<<1) +
+		uint64(in[2])*(uint64(in[3])<<1)
+	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
+		uint64(in[1])*(uint64(in[5])<<2) +
+		uint64(in[2])*(uint64(in[4])<<1) +
+		uint64(in[3])*(uint64(in[3])<<1)
+	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
+		uint64(in[1])*(uint64(in[6])<<1) +
+		uint64(in[2])*(uint64(in[5])<<1) +
+		uint64(in[3])*(uint64(in[4])<<1)
+	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
+	// which is < 2**64 as required.
+	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
+		uint64(in[1])*(uint64(in[7])<<2) +
+		uint64(in[2])*(uint64(in[6])<<1) +
+		uint64(in[3])*(uint64(in[5])<<2) +
+		uint64(in[4])*uint64(in[4])
+	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
+		uint64(in[2])*(uint64(in[7])<<1) +
+		uint64(in[3])*(uint64(in[6])<<1) +
+		uint64(in[4])*(uint64(in[5])<<1)
+	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
+		uint64(in[3])*(uint64(in[7])<<2) +
+		uint64(in[4])*(uint64(in[6])<<1) +
+		uint64(in[5])*(uint64(in[5])<<1)
+	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
+		uint64(in[4])*(uint64(in[7])<<1) +
+		uint64(in[5])*(uint64(in[6])<<1)
+	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
+		uint64(in[5])*(uint64(in[7])<<2) +
+		uint64(in[6])*uint64(in[6])
+	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
+		uint64(in[6])*(uint64(in[7])<<1)
+	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
+		uint64(in[7])*(uint64(in[7])<<1)
+	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
+	tmp[16] = uint64(in[8]) * uint64(in[8])
+
+	p256ReduceDegree(out, tmp)
+}
+
+// p256Mul sets out=in*in2.
+//
+// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
+// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
+// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+func p256Mul(out, in, in2 *[p256Limbs]uint32) {
+	var tmp [17]uint64
+
+	tmp[0] = uint64(in[0]) * uint64(in2[0])
+	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
+		uint64(in[1])*(uint64(in2[0])<<0)
+	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
+		uint64(in[1])*(uint64(in2[1])<<1) +
+		uint64(in[2])*(uint64(in2[0])<<0)
+	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
+		uint64(in[1])*(uint64(in2[2])<<0) +
+		uint64(in[2])*(uint64(in2[1])<<0) +
+		uint64(in[3])*(uint64(in2[0])<<0)
+	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
+		uint64(in[1])*(uint64(in2[3])<<1) +
+		uint64(in[2])*(uint64(in2[2])<<0) +
+		uint64(in[3])*(uint64(in2[1])<<1) +
+		uint64(in[4])*(uint64(in2[0])<<0)
+	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
+		uint64(in[1])*(uint64(in2[4])<<0) +
+		uint64(in[2])*(uint64(in2[3])<<0) +
+		uint64(in[3])*(uint64(in2[2])<<0) +
+		uint64(in[4])*(uint64(in2[1])<<0) +
+		uint64(in[5])*(uint64(in2[0])<<0)
+	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
+		uint64(in[1])*(uint64(in2[5])<<1) +
+		uint64(in[2])*(uint64(in2[4])<<0) +
+		uint64(in[3])*(uint64(in2[3])<<1) +
+		uint64(in[4])*(uint64(in2[2])<<0) +
+		uint64(in[5])*(uint64(in2[1])<<1) +
+		uint64(in[6])*(uint64(in2[0])<<0)
+	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
+		uint64(in[1])*(uint64(in2[6])<<0) +
+		uint64(in[2])*(uint64(in2[5])<<0) +
+		uint64(in[3])*(uint64(in2[4])<<0) +
+		uint64(in[4])*(uint64(in2[3])<<0) +
+		uint64(in[5])*(uint64(in2[2])<<0) +
+		uint64(in[6])*(uint64(in2[1])<<0) +
+		uint64(in[7])*(uint64(in2[0])<<0)
+	// tmp[8] has the greatest value but doesn't overflow. See logic in
+	// p256Square.
+	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
+		uint64(in[1])*(uint64(in2[7])<<1) +
+		uint64(in[2])*(uint64(in2[6])<<0) +
+		uint64(in[3])*(uint64(in2[5])<<1) +
+		uint64(in[4])*(uint64(in2[4])<<0) +
+		uint64(in[5])*(uint64(in2[3])<<1) +
+		uint64(in[6])*(uint64(in2[2])<<0) +
+		uint64(in[7])*(uint64(in2[1])<<1) +
+		uint64(in[8])*(uint64(in2[0])<<0)
+	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
+		uint64(in[2])*(uint64(in2[7])<<0) +
+		uint64(in[3])*(uint64(in2[6])<<0) +
+		uint64(in[4])*(uint64(in2[5])<<0) +
+		uint64(in[5])*(uint64(in2[4])<<0) +
+		uint64(in[6])*(uint64(in2[3])<<0) +
+		uint64(in[7])*(uint64(in2[2])<<0) +
+		uint64(in[8])*(uint64(in2[1])<<0)
+	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
+		uint64(in[3])*(uint64(in2[7])<<1) +
+		uint64(in[4])*(uint64(in2[6])<<0) +
+		uint64(in[5])*(uint64(in2[5])<<1) +
+		uint64(in[6])*(uint64(in2[4])<<0) +
+		uint64(in[7])*(uint64(in2[3])<<1) +
+		uint64(in[8])*(uint64(in2[2])<<0)
+	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
+		uint64(in[4])*(uint64(in2[7])<<0) +
+		uint64(in[5])*(uint64(in2[6])<<0) +
+		uint64(in[6])*(uint64(in2[5])<<0) +
+		uint64(in[7])*(uint64(in2[4])<<0) +
+		uint64(in[8])*(uint64(in2[3])<<0)
+	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
+		uint64(in[5])*(uint64(in2[7])<<1) +
+		uint64(in[6])*(uint64(in2[6])<<0) +
+		uint64(in[7])*(uint64(in2[5])<<1) +
+		uint64(in[8])*(uint64(in2[4])<<0)
+	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
+		uint64(in[6])*(uint64(in2[7])<<0) +
+		uint64(in[7])*(uint64(in2[6])<<0) +
+		uint64(in[8])*(uint64(in2[5])<<0)
+	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
+		uint64(in[7])*(uint64(in2[7])<<1) +
+		uint64(in[8])*(uint64(in2[6])<<0)
+	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
+		uint64(in[8])*(uint64(in2[7])<<0)
+	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
+
+	p256ReduceDegree(out, tmp)
+}
+
+func p256Assign(out, in *[p256Limbs]uint32) {
+	*out = *in
+}
+
+// p256Invert calculates |out| = |in|^{-1}
+//
+// Based on Fermat's Little Theorem:
+//
+//	a^p = a (mod p)
+//	a^{p-1} = 1 (mod p)
+//	a^{p-2} = a^{-1} (mod p)
+func p256Invert(out, in *[p256Limbs]uint32) {
+	var ftmp, ftmp2 [p256Limbs]uint32
+
+	// each e_I will hold |in|^{2^I - 1}
+	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
+
+	p256Square(&ftmp, in)     // 2^1
+	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
+	p256Assign(&e2, &ftmp)
+	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
+	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
+	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
+	p256Assign(&e4, &ftmp)
+	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
+	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
+	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
+	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
+	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
+	p256Assign(&e8, &ftmp)
+	for i := 0; i < 8; i++ {
+		p256Square(&ftmp, &ftmp)
+	} // 2^16 - 2^8
+	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
+	p256Assign(&e16, &ftmp)
+	for i := 0; i < 16; i++ {
+		p256Square(&ftmp, &ftmp)
+	} // 2^32 - 2^16
+	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
+	p256Assign(&e32, &ftmp)
+	for i := 0; i < 32; i++ {
+		p256Square(&ftmp, &ftmp)
+	} // 2^64 - 2^32
+	p256Assign(&e64, &ftmp)
+	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
+	for i := 0; i < 192; i++ {
+		p256Square(&ftmp, &ftmp)
+	} // 2^256 - 2^224 + 2^192
+
+	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
+	for i := 0; i < 16; i++ {
+		p256Square(&ftmp2, &ftmp2)
+	} // 2^80 - 2^16
+	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
+	for i := 0; i < 8; i++ {
+		p256Square(&ftmp2, &ftmp2)
+	} // 2^88 - 2^8
+	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
+	for i := 0; i < 4; i++ {
+		p256Square(&ftmp2, &ftmp2)
+	} // 2^92 - 2^4
+	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
+	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
+	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
+	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
+	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
+	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
+	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
+
+	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
+}
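The exponent reached above, 2**256 - 2**224 + 2**192 + 2**96 - 3, is exactly p-2 for the P-256 prime p = 2**256 - 2**224 + 2**192 + 2**96 - 1. A big.Int cross-check of the Fermat identity against the extended-Euclid inverse (a sketch, not part of the patch):

	package main

	import (
		"fmt"
		"math/big"
	)

	func main() {
		p, _ := new(big.Int).SetString("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)
		a := big.NewInt(987654321)

		fermat := new(big.Int).Exp(a, new(big.Int).Sub(p, big.NewInt(2)), p)
		euclid := new(big.Int).ModInverse(a, p)
		fmt.Println(fermat.Cmp(euclid) == 0) // true: a^(p-2) == a^{-1} (mod p)
	}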
+
+// p256Scalar3 sets out=3*out.
+//
+// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+func p256Scalar3(out *[p256Limbs]uint32) {
+	var carry uint32
+
+	for i := 0; ; i++ {
+		out[i] *= 3
+		out[i] += carry
+		carry = out[i] >> 29
+		out[i] &= bottom29Bits
+
+		i++
+		if i == p256Limbs {
+			break
+		}
+
+		out[i] *= 3
+		out[i] += carry
+		carry = out[i] >> 28
+		out[i] &= bottom28Bits
+	}
+
+	p256ReduceCarry(out, carry)
+}
+
+// p256Scalar4 sets out=4*out.
+//
+// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+func p256Scalar4(out *[p256Limbs]uint32) {
+	var carry, nextCarry uint32
+
+	for i := 0; ; i++ {
+		nextCarry = out[i] >> 27
+		out[i] <<= 2
+		out[i] &= bottom29Bits
+		out[i] += carry
+		carry = nextCarry + (out[i] >> 29)
+		out[i] &= bottom29Bits
+
+		i++
+		if i == p256Limbs {
+			break
+		}
+		nextCarry = out[i] >> 26
+		out[i] <<= 2
+		out[i] &= bottom28Bits
+		out[i] += carry
+		carry = nextCarry + (out[i] >> 28)
+		out[i] &= bottom28Bits
+	}
+
+	p256ReduceCarry(out, carry)
+}
+
+// p256Scalar8 sets out=8*out.
+//
+// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
+func p256Scalar8(out *[p256Limbs]uint32) {
+	var carry, nextCarry uint32
+
+	for i := 0; ; i++ {
+		nextCarry = out[i] >> 26
+		out[i] <<= 3
+		out[i] &= bottom29Bits
+		out[i] += carry
+		carry = nextCarry + (out[i] >> 29)
+		out[i] &= bottom29Bits
+
+		i++
+		if i == p256Limbs {
+			break
+		}
+		nextCarry = out[i] >> 25
+		out[i] <<= 3
+		out[i] &= bottom28Bits
+		out[i] += carry
+		carry = nextCarry + (out[i] >> 28)
+		out[i] &= bottom28Bits
+	}
+
+	p256ReduceCarry(out, carry)
+}
+
+// p256CopyConditional sets out=in if mask == 0xffffffff, in constant time.
+//
+// On entry: mask is either 0 or 0xffffffff.
+func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
+	for i := 0; i < p256Limbs; i++ {
+		tmp := mask & (in[i] ^ out[i])
+		out[i] ^= tmp
+	}
+}
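The xor-with-mask select never branches on mask, so the timing is independent of whether the copy happens. The same idiom in standalone form (a sketch, not part of the patch):

	package main

	import "fmt"

	// condCopy sets out = in when mask == 0xffffffff and leaves out untouched
	// when mask == 0: in[i]^out[i] is the bitwise difference, and masking it
	// in or out selects the result without a branch.
	func condCopy(out, in []uint32, mask uint32) {
		for i := range out {
			out[i] ^= mask & (in[i] ^ out[i])
		}
	}

	func main() {
		out, in := []uint32{1, 2, 3}, []uint32{7, 8, 9}
		condCopy(out, in, 0)
		fmt.Println(out) // [1 2 3]
		condCopy(out, in, 0xffffffff)
		fmt.Println(out) // [7 8 9]
	}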
+
+// p256FromBig sets out = R*in.
+func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
+	tmp := new(big.Int).Lsh(in, 257)
+	tmp.Mod(tmp, p256Params.P)
+
+	for i := 0; i < p256Limbs; i++ {
+		if bits := tmp.Bits(); len(bits) > 0 {
+			out[i] = uint32(bits[0]) & bottom29Bits
+		} else {
+			out[i] = 0
+		}
+		tmp.Rsh(tmp, 29)
+
+		i++
+		if i == p256Limbs {
+			break
+		}
+
+		if bits := tmp.Bits(); len(bits) > 0 {
+			out[i] = uint32(bits[0]) & bottom28Bits
+		} else {
+			out[i] = 0
+		}
+		tmp.Rsh(tmp, 28)
+	}
+}
+
+// p256ToBig returns a *big.Int containing the value of in.
+func p256ToBig(in *[p256Limbs]uint32) *big.Int {
+	result, tmp := new(big.Int), new(big.Int)
+
+	result.SetInt64(int64(in[p256Limbs-1]))
+	for i := p256Limbs - 2; i >= 0; i-- {
+		if (i & 1) == 0 {
+			result.Lsh(result, 29)
+		} else {
+			result.Lsh(result, 28)
+		}
+		tmp.SetInt64(int64(in[i]))
+		result.Add(result, tmp)
+	}
+
+	result.Mul(result, p256RInverse)
+	result.Mod(result, p256Params.P)
+	return result
+}
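p256FromBig and p256ToBig are inverses once the Montgomery factor R is accounted for. The limb packing they share — alternating 29- and 28-bit limbs, least significant first — round-trips like this (a sketch assuming the same 9-limb layout, not part of the patch):

	package main

	import (
		"fmt"
		"math/big"
	)

	const limbCount = 9

	func width(i int) uint { // even limbs hold 29 bits, odd limbs 28
		if i&1 == 0 {
			return 29
		}
		return 28
	}

	func split(v *big.Int) []uint32 {
		out := make([]uint32, limbCount)
		t := new(big.Int).Set(v)
		for i := range out {
			mask := big.NewInt(1<<width(i) - 1)
			out[i] = uint32(new(big.Int).And(t, mask).Uint64())
			t.Rsh(t, width(i))
		}
		return out
	}

	func join(in []uint32) *big.Int {
		v := new(big.Int)
		for i := limbCount - 1; i >= 0; i-- {
			v.Lsh(v, width(i))
			v.Add(v, big.NewInt(int64(in[i])))
		}
		return v
	}

	func main() {
		v, _ := new(big.Int).SetString("123456789abcdef0123456789abcdef0", 16)
		fmt.Println(join(split(v)).Cmp(v) == 0) // true
	}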
diff --git a/src/crypto/elliptic/p256_noasm.go b/src/crypto/elliptic/p256_noasm.go
new file mode 100644
index 0000000..380ea66
--- /dev/null
+++ b/src/crypto/elliptic/p256_noasm.go
@@ -0,0 +1,15 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !s390x && !arm64 && !ppc64le
+// +build !amd64,!s390x,!arm64,!ppc64le
+
+package elliptic
+
+var p256 p256Curve
+
+func initP256Arch() {
+	// Use pure Go constant-time implementation.
+	p256 = p256Curve{p256Params}
+}
diff --git a/src/crypto/elliptic/p256_ppc64le.go b/src/crypto/elliptic/p256_ppc64le.go
index dda1157..3867a87 100644
--- a/src/crypto/elliptic/p256_ppc64le.go
+++ b/src/crypto/elliptic/p256_ppc64le.go
@@ -35,7 +35,6 @@
 func initP256Arch() {
 	p256 = p256CurveFast{p256Params}
 	initTable()
-	return
 }
 
 func (curve p256CurveFast) Params() *CurveParams {
@@ -73,7 +72,6 @@
 //go:noescape
 func p256Select(point *p256Point, table []p256Point, idx int)
 
-//
 //go:noescape
 func p256SelectBase(point *p256Point, table []p256Point, idx int)
 
@@ -85,12 +83,9 @@
 //go:noescape
 func p256PointAddAffineAsm(res, in1, in2 *p256Point, sign, sel, zero int)
 
-// Point add
-//
 //go:noescape
 func p256PointAddAsm(res, in1, in2 *p256Point) int
 
-//
 //go:noescape
 func p256PointDoubleAsm(res, in *p256Point)
 
@@ -340,7 +335,6 @@
 }
 
 func initTable() {
-
 	p256PreFast = new([37][64]p256Point)
 
 	// TODO: For big endian, these slices should be in reverse byte order,
@@ -352,7 +346,6 @@
 			0x25, 0xf3, 0x21, 0xdd, 0x88, 0x86, 0xe8, 0xd2, 0x85, 0x5d, 0x88, 0x25, 0x18, 0xff, 0x71, 0x85}, //(p256.y*2^256)%p
 		z: [32]byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
 			0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, //(p256.z*2^256)%p
-
 	}
 
 	t1 := new(p256Point)
diff --git a/src/crypto/elliptic/p256_s390x.go b/src/crypto/elliptic/p256_s390x.go
index 735e9f5..b7331eb 100644
--- a/src/crypto/elliptic/p256_s390x.go
+++ b/src/crypto/elliptic/p256_s390x.go
@@ -60,7 +60,6 @@
 
 	// No vector support, use pure Go implementation.
 	p256 = p256Curve{p256Params}
-	return
 }
 
 func (curve p256CurveFast) Params() *CurveParams {
diff --git a/src/crypto/elliptic/p384.go b/src/crypto/elliptic/p384.go
deleted file mode 100644
index 33a441d..0000000
--- a/src/crypto/elliptic/p384.go
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package elliptic
-
-import (
-	"crypto/elliptic/internal/nistec"
-	"crypto/rand"
-	"math/big"
-)
-
-// p384Curve is a Curve implementation based on nistec.P384Point.
-//
-// It's a wrapper that exposes the big.Int-based Curve interface and encodes the
-// legacy idiosyncrasies it requires, such as invalid and infinity point
-// handling.
-//
-// To interact with the nistec package, points are encoded into and decoded from
-// properly formatted byte slices. All big.Int use is limited to this package.
-// Encoding and decoding is 1/1000th of the runtime of a scalar multiplication,
-// so the overhead is acceptable.
-type p384Curve struct {
-	params *CurveParams
-}
-
-var p384 p384Curve
-var _ Curve = p384
-
-func initP384() {
-	p384.params = &CurveParams{
-		Name:    "P-384",
-		BitSize: 384,
-		// FIPS 186-4, section D.1.2.4
-		P: bigFromDecimal("394020061963944792122790401001436138050797392704654" +
-			"46667948293404245721771496870329047266088258938001861606973112319"),
-		N: bigFromDecimal("394020061963944792122790401001436138050797392704654" +
-			"46667946905279627659399113263569398956308152294913554433653942643"),
-		B: bigFromHex("b3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088" +
-			"f5013875ac656398d8a2ed19d2a85c8edd3ec2aef"),
-		Gx: bigFromHex("aa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741" +
-			"e082542a385502f25dbf55296c3a545e3872760ab7"),
-		Gy: bigFromHex("3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da31" +
-			"13b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f"),
-	}
-}
-
-func (curve p384Curve) Params() *CurveParams {
-	return curve.params
-}
-
-func (curve p384Curve) IsOnCurve(x, y *big.Int) bool {
-	// IsOnCurve is documented to reject (0, 0), the conventional point at
-	// infinity, which however is accepted by p384PointFromAffine.
-	if x.Sign() == 0 && y.Sign() == 0 {
-		return false
-	}
-	_, ok := p384PointFromAffine(x, y)
-	return ok
-}
-
-func p384PointFromAffine(x, y *big.Int) (p *nistec.P384Point, ok bool) {
-	// (0, 0) is by convention the point at infinity, which can't be represented
-	// in affine coordinates. Marshal incorrectly encodes it as an uncompressed
-	// point, which SetBytes would correctly reject. See Issue 37294.
-	if x.Sign() == 0 && y.Sign() == 0 {
-		return nistec.NewP384Point(), true
-	}
-	if x.Sign() < 0 || y.Sign() < 0 {
-		return nil, false
-	}
-	if x.BitLen() > 384 || y.BitLen() > 384 {
-		return nil, false
-	}
-	p, err := nistec.NewP384Point().SetBytes(Marshal(P384(), x, y))
-	if err != nil {
-		return nil, false
-	}
-	return p, true
-}
-
-func p384PointToAffine(p *nistec.P384Point) (x, y *big.Int) {
-	out := p.Bytes()
-	if len(out) == 1 && out[0] == 0 {
-		// This is the correct encoding of the point at infinity, which
-		// Unmarshal does not support. See Issue 37294.
-		return new(big.Int), new(big.Int)
-	}
-	x, y = Unmarshal(P384(), out)
-	if x == nil {
-		panic("crypto/elliptic: internal error: Unmarshal rejected a valid point encoding")
-	}
-	return x, y
-}
-
-// p384RandomPoint returns a random point on the curve. It's used when Add,
-// Double, or ScalarMult are fed a point not on the curve, which is undefined
-// behavior. Originally, we used to do the math on it anyway (which allows
-// invalid curve attacks) and relied on the caller and Unmarshal to avoid this
-// happening in the first place. Now, we just can't construct a nistec.P384Point
-// for an invalid pair of coordinates, because that API is safer. If we panic,
-// we risk introducing a DoS. If we return nil, we risk a panic. If we return
-// the input, ecdsa.Verify might fail open. The safest course seems to be to
-// return a valid, random point, which hopefully won't help the attacker.
-func p384RandomPoint() (x, y *big.Int) {
-	_, x, y, err := GenerateKey(P384(), rand.Reader)
-	if err != nil {
-		panic("crypto/elliptic: failed to generate random point")
-	}
-	return x, y
-}
-
-func (p384Curve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
-	p1, ok := p384PointFromAffine(x1, y1)
-	if !ok {
-		return p384RandomPoint()
-	}
-	p2, ok := p384PointFromAffine(x2, y2)
-	if !ok {
-		return p384RandomPoint()
-	}
-	return p384PointToAffine(p1.Add(p1, p2))
-}
-
-func (p384Curve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
-	p, ok := p384PointFromAffine(x1, y1)
-	if !ok {
-		return p384RandomPoint()
-	}
-	return p384PointToAffine(p.Double(p))
-}
-
-func (p384Curve) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *big.Int) {
-	p, ok := p384PointFromAffine(Bx, By)
-	if !ok {
-		return p384RandomPoint()
-	}
-	return p384PointToAffine(p.ScalarMult(p, scalar))
-}
-
-func (p384Curve) ScalarBaseMult(scalar []byte) (*big.Int, *big.Int) {
-	p := nistec.NewP384Generator()
-	return p384PointToAffine(p.ScalarMult(p, scalar))
-}
diff --git a/src/crypto/elliptic/p521.go b/src/crypto/elliptic/p521.go
deleted file mode 100644
index 6a3ade3..0000000
--- a/src/crypto/elliptic/p521.go
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package elliptic
-
-import (
-	"crypto/elliptic/internal/nistec"
-	"crypto/rand"
-	"math/big"
-)
-
-// p521Curve is a Curve implementation based on nistec.P521Point.
-//
-// It's a wrapper that exposes the big.Int-based Curve interface and encodes the
-// legacy idiosyncrasies it requires, such as invalid and infinity point
-// handling.
-//
-// To interact with the nistec package, points are encoded into and decoded from
-// properly formatted byte slices. All big.Int use is limited to this package.
-// Encoding and decoding is 1/1000th of the runtime of a scalar multiplication,
-// so the overhead is acceptable.
-type p521Curve struct {
-	params *CurveParams
-}
-
-var p521 p521Curve
-var _ Curve = p521
-
-func initP521() {
-	p521.params = &CurveParams{
-		Name:    "P-521",
-		BitSize: 521,
-		// FIPS 186-4, section D.1.2.5
-		P: bigFromDecimal("68647976601306097149819007990813932172694353001433" +
-			"0540939446345918554318339765605212255964066145455497729631139148" +
-			"0858037121987999716643812574028291115057151"),
-		N: bigFromDecimal("68647976601306097149819007990813932172694353001433" +
-			"0540939446345918554318339765539424505774633321719753296399637136" +
-			"3321113864768612440380340372808892707005449"),
-		B: bigFromHex("0051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8" +
-			"b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef" +
-			"451fd46b503f00"),
-		Gx: bigFromHex("00c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f8" +
-			"28af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf9" +
-			"7e7e31c2e5bd66"),
-		Gy: bigFromHex("011839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817" +
-			"afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088" +
-			"be94769fd16650"),
-	}
-}
-
-func (curve p521Curve) Params() *CurveParams {
-	return curve.params
-}
-
-func (curve p521Curve) IsOnCurve(x, y *big.Int) bool {
-	// IsOnCurve is documented to reject (0, 0), the conventional point at
-	// infinity, which however is accepted by p521PointFromAffine.
-	if x.Sign() == 0 && y.Sign() == 0 {
-		return false
-	}
-	_, ok := p521PointFromAffine(x, y)
-	return ok
-}
-
-func p521PointFromAffine(x, y *big.Int) (p *nistec.P521Point, ok bool) {
-	// (0, 0) is by convention the point at infinity, which can't be represented
-	// in affine coordinates. Marshal incorrectly encodes it as an uncompressed
-	// point, which SetBytes would correctly reject. See Issue 37294.
-	if x.Sign() == 0 && y.Sign() == 0 {
-		return nistec.NewP521Point(), true
-	}
-	if x.Sign() < 0 || y.Sign() < 0 {
-		return nil, false
-	}
-	if x.BitLen() > 521 || y.BitLen() > 521 {
-		return nil, false
-	}
-	p, err := nistec.NewP521Point().SetBytes(Marshal(P521(), x, y))
-	if err != nil {
-		return nil, false
-	}
-	return p, true
-}
-
-func p521PointToAffine(p *nistec.P521Point) (x, y *big.Int) {
-	out := p.Bytes()
-	if len(out) == 1 && out[0] == 0 {
-		// This is the correct encoding of the point at infinity, which
-		// Unmarshal does not support. See Issue 37294.
-		return new(big.Int), new(big.Int)
-	}
-	x, y = Unmarshal(P521(), out)
-	if x == nil {
-		panic("crypto/elliptic: internal error: Unmarshal rejected a valid point encoding")
-	}
-	return x, y
-}
-
-// p521RandomPoint returns a random point on the curve. It's used when Add,
-// Double, or ScalarMult are fed a point not on the curve, which is undefined
-// behavior. Originally, we used to do the math on it anyway (which allows
-// invalid curve attacks) and relied on the caller and Unmarshal to avoid this
-// happening in the first place. Now, we just can't construct a nistec.P521Point
-// for an invalid pair of coordinates, because that API is safer. If we panic,
-// we risk introducing a DoS. If we return nil, we risk a panic. If we return
-// the input, ecdsa.Verify might fail open. The safest course seems to be to
-// return a valid, random point, which hopefully won't help the attacker.
-func p521RandomPoint() (x, y *big.Int) {
-	_, x, y, err := GenerateKey(P521(), rand.Reader)
-	if err != nil {
-		panic("crypto/elliptic: failed to generate random point")
-	}
-	return x, y
-}
-
-func (p521Curve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
-	p1, ok := p521PointFromAffine(x1, y1)
-	if !ok {
-		return p521RandomPoint()
-	}
-	p2, ok := p521PointFromAffine(x2, y2)
-	if !ok {
-		return p521RandomPoint()
-	}
-	return p521PointToAffine(p1.Add(p1, p2))
-}
-
-func (p521Curve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
-	p, ok := p521PointFromAffine(x1, y1)
-	if !ok {
-		return p521RandomPoint()
-	}
-	return p521PointToAffine(p.Double(p))
-}
-
-func (p521Curve) ScalarMult(Bx, By *big.Int, scalar []byte) (*big.Int, *big.Int) {
-	p, ok := p521PointFromAffine(Bx, By)
-	if !ok {
-		return p521RandomPoint()
-	}
-	return p521PointToAffine(p.ScalarMult(p, scalar))
-}
-
-func (p521Curve) ScalarBaseMult(scalar []byte) (*big.Int, *big.Int) {
-	p := nistec.NewP521Generator()
-	return p521PointToAffine(p.ScalarMult(p, scalar))
-}
-
-func bigFromDecimal(s string) *big.Int {
-	b, ok := new(big.Int).SetString(s, 10)
-	if !ok {
-		panic("invalid encoding")
-	}
-	return b
-}
-
-func bigFromHex(s string) *big.Int {
-	b, ok := new(big.Int).SetString(s, 16)
-	if !ok {
-		panic("invalid encoding")
-	}
-	return b
-}
diff --git a/src/crypto/elliptic/params.go b/src/crypto/elliptic/params.go
new file mode 100644
index 0000000..586f2c0
--- /dev/null
+++ b/src/crypto/elliptic/params.go
@@ -0,0 +1,296 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package elliptic
+
+import "math/big"
+
+// CurveParams contains the parameters of an elliptic curve and also provides
+// a generic, non-constant time implementation of Curve.
+type CurveParams struct {
+	P       *big.Int // the order of the underlying field
+	N       *big.Int // the order of the base point
+	B       *big.Int // the constant of the curve equation
+	Gx, Gy  *big.Int // (x,y) of the base point
+	BitSize int      // the size of the underlying field
+	Name    string   // the canonical name of the curve
+}
+
+func (curve *CurveParams) Params() *CurveParams {
+	return curve
+}
+
+// CurveParams operates, internally, on Jacobian coordinates. For a given
+// (x, y) position on the curve, the Jacobian coordinates are (x1, y1, z1)
+// where x = x1/z1² and y = y1/z1³. The greatest speedups come when the whole
+// calculation can be performed within the transform (as in ScalarMult and
+// ScalarBaseMult). But even for Add and Double, it's faster to apply and
+// reverse the transform than to operate in affine coordinates.
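A quick round-trip of that transform with big.Int, using only the exported curve parameters (a sketch, not part of the patch):

	package main

	import (
		"crypto/elliptic"
		"fmt"
		"math/big"
	)

	func main() {
		c := elliptic.P256().Params()
		p := c.P

		// Lift the base point to Jacobian form with an arbitrary z != 0:
		// (X, Y, z) with X = x*z², Y = y*z³.
		z := big.NewInt(5)
		z2 := new(big.Int).Mul(z, z)
		z3 := new(big.Int).Mul(z2, z)
		X := new(big.Int).Mod(new(big.Int).Mul(c.Gx, z2), p)
		Y := new(big.Int).Mod(new(big.Int).Mul(c.Gy, z3), p)

		// Project back: x = X/z², y = Y/z³ (mod p).
		zinv := new(big.Int).ModInverse(z, p)
		zinv2 := new(big.Int).Mul(zinv, zinv)
		zinv3 := new(big.Int).Mul(zinv2, zinv)
		x := new(big.Int).Mod(new(big.Int).Mul(X, zinv2), p)
		y := new(big.Int).Mod(new(big.Int).Mul(Y, zinv3), p)

		fmt.Println(x.Cmp(c.Gx) == 0 && y.Cmp(c.Gy) == 0) // true
	}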
+
+// polynomial returns x³ - 3x + b.
+func (curve *CurveParams) polynomial(x *big.Int) *big.Int {
+	x3 := new(big.Int).Mul(x, x)
+	x3.Mul(x3, x)
+
+	threeX := new(big.Int).Lsh(x, 1)
+	threeX.Add(threeX, x)
+
+	x3.Sub(x3, threeX)
+	x3.Add(x3, curve.B)
+	x3.Mod(x3, curve.P)
+
+	return x3
+}
+
+func (curve *CurveParams) IsOnCurve(x, y *big.Int) bool {
+	// If there is a dedicated constant-time implementation for this curve operation,
+	// use that instead of the generic one.
+	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
+		return specific.IsOnCurve(x, y)
+	}
+
+	if x.Sign() < 0 || x.Cmp(curve.P) >= 0 ||
+		y.Sign() < 0 || y.Cmp(curve.P) >= 0 {
+		return false
+	}
+
+	// y² = x³ - 3x + b
+	y2 := new(big.Int).Mul(y, y)
+	y2.Mod(y2, curve.P)
+
+	return curve.polynomial(x).Cmp(y2) == 0
+}
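For example, the base point must satisfy the equation, while the conventional point at infinity is rejected (a usage sketch, not part of the patch):

	package main

	import (
		"crypto/elliptic"
		"fmt"
		"math/big"
	)

	func main() {
		c := elliptic.P256().Params()
		fmt.Println(c.IsOnCurve(c.Gx, c.Gy)) // true: Gy² ≡ Gx³ - 3·Gx + B (mod P)

		// (0, 0), the conventional encoding of ∞, fails y² = x³ - 3x + b.
		fmt.Println(c.IsOnCurve(new(big.Int), new(big.Int))) // false
	}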
+
+// zForAffine returns a Jacobian Z value for the affine point (x, y). If x and
+// y are zero, it assumes that they represent the point at infinity because (0,
+// 0) is not on any of the curves handled here.
+func zForAffine(x, y *big.Int) *big.Int {
+	z := new(big.Int)
+	if x.Sign() != 0 || y.Sign() != 0 {
+		z.SetInt64(1)
+	}
+	return z
+}
+
+// affineFromJacobian reverses the Jacobian transform. See the comment at the
+// top of the file. If the point is ∞ it returns 0, 0.
+func (curve *CurveParams) affineFromJacobian(x, y, z *big.Int) (xOut, yOut *big.Int) {
+	if z.Sign() == 0 {
+		return new(big.Int), new(big.Int)
+	}
+
+	zinv := new(big.Int).ModInverse(z, curve.P)
+	zinvsq := new(big.Int).Mul(zinv, zinv)
+
+	xOut = new(big.Int).Mul(x, zinvsq)
+	xOut.Mod(xOut, curve.P)
+	zinvsq.Mul(zinvsq, zinv)
+	yOut = new(big.Int).Mul(y, zinvsq)
+	yOut.Mod(yOut, curve.P)
+	return
+}
+
+func (curve *CurveParams) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
+	// If there is a dedicated constant-time implementation for this curve operation,
+	// use that instead of the generic one.
+	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
+		return specific.Add(x1, y1, x2, y2)
+	}
+
+	z1 := zForAffine(x1, y1)
+	z2 := zForAffine(x2, y2)
+	return curve.affineFromJacobian(curve.addJacobian(x1, y1, z1, x2, y2, z2))
+}
+
+// addJacobian takes two points in Jacobian coordinates, (x1, y1, z1) and
+// (x2, y2, z2) and returns their sum, also in Jacobian form.
+func (curve *CurveParams) addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (*big.Int, *big.Int, *big.Int) {
+	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
+	x3, y3, z3 := new(big.Int), new(big.Int), new(big.Int)
+	if z1.Sign() == 0 {
+		x3.Set(x2)
+		y3.Set(y2)
+		z3.Set(z2)
+		return x3, y3, z3
+	}
+	if z2.Sign() == 0 {
+		x3.Set(x1)
+		y3.Set(y1)
+		z3.Set(z1)
+		return x3, y3, z3
+	}
+
+	z1z1 := new(big.Int).Mul(z1, z1)
+	z1z1.Mod(z1z1, curve.P)
+	z2z2 := new(big.Int).Mul(z2, z2)
+	z2z2.Mod(z2z2, curve.P)
+
+	u1 := new(big.Int).Mul(x1, z2z2)
+	u1.Mod(u1, curve.P)
+	u2 := new(big.Int).Mul(x2, z1z1)
+	u2.Mod(u2, curve.P)
+	h := new(big.Int).Sub(u2, u1)
+	xEqual := h.Sign() == 0
+	if h.Sign() == -1 {
+		h.Add(h, curve.P)
+	}
+	i := new(big.Int).Lsh(h, 1)
+	i.Mul(i, i)
+	j := new(big.Int).Mul(h, i)
+
+	s1 := new(big.Int).Mul(y1, z2)
+	s1.Mul(s1, z2z2)
+	s1.Mod(s1, curve.P)
+	s2 := new(big.Int).Mul(y2, z1)
+	s2.Mul(s2, z1z1)
+	s2.Mod(s2, curve.P)
+	r := new(big.Int).Sub(s2, s1)
+	if r.Sign() == -1 {
+		r.Add(r, curve.P)
+	}
+	yEqual := r.Sign() == 0
+	if xEqual && yEqual {
+		return curve.doubleJacobian(x1, y1, z1)
+	}
+	r.Lsh(r, 1)
+	v := new(big.Int).Mul(u1, i)
+
+	x3.Set(r)
+	x3.Mul(x3, x3)
+	x3.Sub(x3, j)
+	x3.Sub(x3, v)
+	x3.Sub(x3, v)
+	x3.Mod(x3, curve.P)
+
+	y3.Set(r)
+	v.Sub(v, x3)
+	y3.Mul(y3, v)
+	s1.Mul(s1, j)
+	s1.Lsh(s1, 1)
+	y3.Sub(y3, s1)
+	y3.Mod(y3, curve.P)
+
+	z3.Add(z1, z2)
+	z3.Mul(z3, z3)
+	z3.Sub(z3, z1z1)
+	z3.Sub(z3, z2z2)
+	z3.Mul(z3, h)
+	z3.Mod(z3, curve.P)
+
+	return x3, y3, z3
+}
+
+func (curve *CurveParams) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
+	// If there is a dedicated constant-time implementation for this curve operation,
+	// use that instead of the generic one.
+	if specific, ok := matchesSpecificCurve(curve, p224, p384, p521); ok {
+		return specific.Double(x1, y1)
+	}
+
+	z1 := zForAffine(x1, y1)
+	return curve.affineFromJacobian(curve.doubleJacobian(x1, y1, z1))
+}
+
+// doubleJacobian takes a point in Jacobian coordinates, (x, y, z), and
+// returns its double, also in Jacobian form.
+func (curve *CurveParams) doubleJacobian(x, y, z *big.Int) (*big.Int, *big.Int, *big.Int) {
+	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
+	delta := new(big.Int).Mul(z, z)
+	delta.Mod(delta, curve.P)
+	gamma := new(big.Int).Mul(y, y)
+	gamma.Mod(gamma, curve.P)
+	alpha := new(big.Int).Sub(x, delta)
+	if alpha.Sign() == -1 {
+		alpha.Add(alpha, curve.P)
+	}
+	alpha2 := new(big.Int).Add(x, delta)
+	alpha.Mul(alpha, alpha2)
+	alpha2.Set(alpha)
+	alpha.Lsh(alpha, 1)
+	alpha.Add(alpha, alpha2)
+
+	beta := alpha2.Mul(x, gamma)
+
+	x3 := new(big.Int).Mul(alpha, alpha)
+	beta8 := new(big.Int).Lsh(beta, 3)
+	beta8.Mod(beta8, curve.P)
+	x3.Sub(x3, beta8)
+	if x3.Sign() == -1 {
+		x3.Add(x3, curve.P)
+	}
+	x3.Mod(x3, curve.P)
+
+	z3 := new(big.Int).Add(y, z)
+	z3.Mul(z3, z3)
+	z3.Sub(z3, gamma)
+	if z3.Sign() == -1 {
+		z3.Add(z3, curve.P)
+	}
+	z3.Sub(z3, delta)
+	if z3.Sign() == -1 {
+		z3.Add(z3, curve.P)
+	}
+	z3.Mod(z3, curve.P)
+
+	beta.Lsh(beta, 2)
+	beta.Sub(beta, x3)
+	if beta.Sign() == -1 {
+		beta.Add(beta, curve.P)
+	}
+	y3 := alpha.Mul(alpha, beta)
+
+	gamma.Mul(gamma, gamma)
+	gamma.Lsh(gamma, 3)
+	gamma.Mod(gamma, curve.P)
+
+	y3.Sub(y3, gamma)
+	if y3.Sign() == -1 {
+		y3.Add(y3, curve.P)
+	}
+	y3.Mod(y3, curve.P)
+
+	return x3, y3, z3
+}
+
+func (curve *CurveParams) ScalarMult(Bx, By *big.Int, k []byte) (*big.Int, *big.Int) {
+	// If there is a dedicated constant-time implementation for this curve operation,
+	// use that instead of the generic one.
+	if specific, ok := matchesSpecificCurve(curve, p224, p256, p384, p521); ok {
+		return specific.ScalarMult(Bx, By, k)
+	}
+
+	Bz := new(big.Int).SetInt64(1)
+	x, y, z := new(big.Int), new(big.Int), new(big.Int)
+
+	for _, byte := range k {
+		for bitNum := 0; bitNum < 8; bitNum++ {
+			x, y, z = curve.doubleJacobian(x, y, z)
+			if byte&0x80 == 0x80 {
+				x, y, z = curve.addJacobian(Bx, By, Bz, x, y, z)
+			}
+			byte <<= 1
+		}
+	}
+
+	return curve.affineFromJacobian(x, y, z)
+}
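The loop is a plain MSB-first double-and-add. Running the same control flow over ordinary integers, where doubling is *2 and point addition is +base, reproduces multiplication — a toy model of the structure, not curve code:

	package main

	import "fmt"

	// scalarTimes mirrors the bit loop of ScalarMult above: double for every
	// bit, add the base when the bit is set, scanning each byte MSB first.
	func scalarTimes(base uint64, k []byte) uint64 {
		var acc uint64
		for _, b := range k {
			for bit := 0; bit < 8; bit++ {
				acc *= 2
				if b&0x80 == 0x80 {
					acc += base
				}
				b <<= 1
			}
		}
		return acc
	}

	func main() {
		fmt.Println(scalarTimes(7, []byte{0x01, 0x2c})) // 2100 == 7 * 0x12c
	}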
+
+func (curve *CurveParams) ScalarBaseMult(k []byte) (*big.Int, *big.Int) {
+	// If there is a dedicated constant-time implementation for this curve operation,
+	// use that instead of the generic one.
+	if specific, ok := matchesSpecificCurve(curve, p224, p256, p384, p521); ok {
+		return specific.ScalarBaseMult(k)
+	}
+
+	return curve.ScalarMult(curve.Gx, curve.Gy, k)
+}
+
+func matchesSpecificCurve(params *CurveParams, available ...Curve) (Curve, bool) {
+	for _, c := range available {
+		if params == c.Params() {
+			return c, true
+		}
+	}
+	return nil, false
+}
diff --git a/src/crypto/rand/rand_getrandom.go b/src/crypto/rand/rand_getrandom.go
index cb31a56..478aa5c 100644
--- a/src/crypto/rand/rand_getrandom.go
+++ b/src/crypto/rand/rand_getrandom.go
@@ -21,7 +21,7 @@
 		//     is returned by a single call to getrandom() on systems where int
 		//     has a size of 32 bits.
 		maxGetRandomRead = (1 << 25) - 1
-	case "freebsd", "dragonfly", "solaris":
+	case "freebsd", "dragonfly", "solaris", "illumos":
 		maxGetRandomRead = 1 << 8
 	default:
 		panic("no maximum specified for GetRandom")
diff --git a/src/crypto/tls/common.go b/src/crypto/tls/common.go
index fdcebd8..1861efc 100644
--- a/src/crypto/tls/common.go
+++ b/src/crypto/tls/common.go
@@ -18,7 +18,6 @@
 	"crypto/x509"
 	"errors"
 	"fmt"
-	"internal/godebug"
 	"io"
 	"net"
 	"strings"
@@ -977,9 +976,6 @@
 	VersionTLS10,
 }
 
-// debugEnableTLS10 enables TLS 1.0. See issue 45428.
-var debugEnableTLS10 = godebug.Get("tls10default") == "1"
-
 // roleClient and roleServer are meant to call supportedVersions and parents
 // with more readability at the callsite.
 const roleClient = true
@@ -991,7 +987,7 @@
 		if needFIPS() && (v < fipsMinVersion(c) || v > fipsMaxVersion(c)) {
 			continue
 		}
-		if (c == nil || c.MinVersion == 0) && !debugEnableTLS10 &&
+		if (c == nil || c.MinVersion == 0) &&
 			isClient && v < VersionTLS12 {
 			continue
 		}
diff --git a/src/crypto/tls/handshake_messages.go b/src/crypto/tls/handshake_messages.go
index 17cf859..7ab0f10 100644
--- a/src/crypto/tls/handshake_messages.go
+++ b/src/crypto/tls/handshake_messages.go
@@ -384,6 +384,7 @@
 		return false
 	}
 
+	seenExts := make(map[uint16]bool)
 	for !extensions.Empty() {
 		var extension uint16
 		var extData cryptobyte.String
@@ -392,6 +393,11 @@
 			return false
 		}
 
+		if seenExts[extension] {
+			return false
+		}
+		seenExts[extension] = true
+
 		switch extension {
 		case extensionServerName:
 			// RFC 6066, Section 3
@@ -750,6 +756,7 @@
 		return false
 	}
 
+	seenExts := make(map[uint16]bool)
 	for !extensions.Empty() {
 		var extension uint16
 		var extData cryptobyte.String
@@ -758,6 +765,11 @@
 			return false
 		}
 
+		if seenExts[extension] {
+			return false
+		}
+		seenExts[extension] = true
+
 		switch extension {
 		case extensionStatusRequest:
 			m.ocspStapling = true
diff --git a/src/crypto/tls/handshake_messages_test.go b/src/crypto/tls/handshake_messages_test.go
index 2f5d0e4..c6fc8f2 100644
--- a/src/crypto/tls/handshake_messages_test.go
+++ b/src/crypto/tls/handshake_messages_test.go
@@ -6,6 +6,7 @@
 
 import (
 	"bytes"
+	"encoding/hex"
 	"math/rand"
 	"reflect"
 	"strings"
@@ -463,3 +464,23 @@
 		t.Fatal("Unmarshaled ServerHello with zero-length SCT")
 	}
 }
+
+func TestRejectDuplicateExtensions(t *testing.T) {
+	clientHelloBytes, err := hex.DecodeString("010000440303000000000000000000000000000000000000000000000000000000000000000000000000001c0000000a000800000568656c6c6f0000000a000800000568656c6c6f")
+	if err != nil {
+		t.Fatalf("failed to decode test ClientHello: %s", err)
+	}
+	var clientHelloCopy clientHelloMsg
+	if clientHelloCopy.unmarshal(clientHelloBytes) {
+		t.Error("Unmarshaled ClientHello with duplicate extensions")
+	}
+
+	serverHelloBytes, err := hex.DecodeString("02000030030300000000000000000000000000000000000000000000000000000000000000000000000000080005000000050000")
+	if err != nil {
+		t.Fatalf("failed to decode test ServerHello: %s", err)
+	}
+	var serverHelloCopy serverHelloMsg
+	if serverHelloCopy.unmarshal(serverHelloBytes) {
+		t.Fatal("Unmarshaled ServerHello with duplicate extensions")
+	}
+}
diff --git a/src/crypto/tls/handshake_server_test.go b/src/crypto/tls/handshake_server_test.go
index 16a2254..1f3a174 100644
--- a/src/crypto/tls/handshake_server_test.go
+++ b/src/crypto/tls/handshake_server_test.go
@@ -400,16 +400,6 @@
 	if err == nil {
 		t.Fatalf("expected failure to connect with TLS 1.0/1.1")
 	}
-
-	defer func(old bool) { debugEnableTLS10 = old }(debugEnableTLS10)
-	debugEnableTLS10 = true
-	_, _, err = testHandshake(t, clientConfig, serverConfig)
-	if err != nil {
-		t.Fatalf("handshake failed: %s", err)
-	}
-	if state.Version != VersionTLS11 {
-		t.Fatalf("incorrect version %x, should be %x", state.Version, VersionTLS11)
-	}
 }
 
 func TestCipherSuitePreference(t *testing.T) {
diff --git a/src/crypto/x509/verify.go b/src/crypto/x509/verify.go
index 218d794..a44f5d6 100644
--- a/src/crypto/x509/verify.go
+++ b/src/crypto/x509/verify.go
@@ -7,6 +7,7 @@
 import (
 	"bytes"
 	"crypto"
+	"crypto/x509/pkix"
 	"errors"
 	"fmt"
 	"net"
@@ -837,6 +838,50 @@
 	return n
 }
 
+// alreadyInChain checks whether a candidate certificate is present in a chain.
+// Rather than doing a direct byte-for-byte equivalence check, we check if the
+// subject, public key, and SAN, if present, are equal. This prevents loops that
+// are created by mutual cross-signatures, or other cross-signature bridge
+// oddities.
+func alreadyInChain(candidate *Certificate, chain []*Certificate) bool {
+	type pubKeyEqual interface {
+		Equal(crypto.PublicKey) bool
+	}
+
+	var candidateSAN *pkix.Extension
+	for _, ext := range candidate.Extensions {
+		if ext.Id.Equal(oidExtensionSubjectAltName) {
+			candidateSAN = &ext
+			break
+		}
+	}
+
+	for _, cert := range chain {
+		if !bytes.Equal(candidate.RawSubject, cert.RawSubject) {
+			continue
+		}
+		if !candidate.PublicKey.(pubKeyEqual).Equal(cert.PublicKey) {
+			continue
+		}
+		var certSAN *pkix.Extension
+		for _, ext := range cert.Extensions {
+			if ext.Id.Equal(oidExtensionSubjectAltName) {
+				certSAN = &ext
+				break
+			}
+		}
+		if candidateSAN == nil && certSAN == nil {
+			return true
+		} else if candidateSAN == nil || certSAN == nil {
+			return false
+		}
+		if bytes.Equal(candidateSAN.Value, certSAN.Value) {
+			return true
+		}
+	}
+	return false
+}
+
 // maxChainSignatureChecks is the maximum number of CheckSignatureFrom calls
 // that an invocation of buildChains will (transitively) make. Most chains are
 // less than 15 certificates long, so this leaves space for multiple chains and
@@ -849,18 +894,9 @@
 		hintCert *Certificate
 	)
 
-	type pubKeyEqual interface {
-		Equal(crypto.PublicKey) bool
-	}
-
 	considerCandidate := func(certType int, candidate *Certificate) {
-		for _, cert := range currentChain {
-			// If a certificate already appeared in the chain we've built, don't
-			// reconsider it. This prevents loops, for isntance those created by
-			// mutual cross-signatures, or other cross-signature bridges oddities.
-			if bytes.Equal(cert.RawSubject, candidate.RawSubject) && cert.PublicKey.(pubKeyEqual).Equal(candidate.PublicKey) {
-				return
-			}
+		if alreadyInChain(candidate, currentChain) {
+			return
 		}
 
 		if sigChecks == nil {
diff --git a/src/crypto/x509/verify_test.go b/src/crypto/x509/verify_test.go
index 1b2cbe3..8a7b08a 100644
--- a/src/crypto/x509/verify_test.go
+++ b/src/crypto/x509/verify_test.go
@@ -2340,6 +2340,29 @@
 				"CN=leaf -> CN=inter b -> CN=inter c -> CN=root",
 			},
 		},
+		{
+			// Build a simple two node graph, where the leaf is directly issued from
+			// the root and both certificates have matching subject and public key, but
+			// the leaf has SANs.
+			name: "leaf with same subject, key, as parent but with SAN",
+			graph: trustGraphDescription{
+				Roots: []string{"root"},
+				Leaf:  "root",
+				Graph: []trustGraphEdge{
+					{
+						Issuer:  "root",
+						Subject: "root",
+						Type:    leafCertificate,
+						MutateTemplate: func(c *Certificate) {
+							c.DNSNames = []string{"localhost"}
+						},
+					},
+				},
+			},
+			expectedChains: []string{
+				"CN=root -> CN=root",
+			},
+		},
 	}
 
 	for _, tc := range tests {
diff --git a/src/crypto/x509/x509.go b/src/crypto/x509/x509.go
index ceb04ae..582e1b1 100644
--- a/src/crypto/x509/x509.go
+++ b/src/crypto/x509/x509.go
@@ -1478,21 +1478,14 @@
 		return nil, errors.New("x509: no SerialNumber given")
 	}
 
-	// RFC 5280 Section 4.1.2.2: serial number must positive and should not be longer
-	// than 20 octets.
+	// RFC 5280 Section 4.1.2.2: serial number must be positive.
 	//
-	// We cannot simply check for len(serialBytes) > 20, because encoding/asn1 may
-	// pad the slice in order to prevent the integer being mistaken for a negative
-	// number (DER uses the high bit of the left-most byte to indicate the sign.),
-	// so we need to double check the composition of the serial if it is exactly
-	// 20 bytes.
+	// We _should_ also restrict serials to <= 20 octets, but it turns out a lot of people
+	// get this wrong, in part because the encoding can itself alter the length of the
+	// serial. For now we accept these non-conformant serials.
 	if template.SerialNumber.Sign() == -1 {
 		return nil, errors.New("x509: serial number must be positive")
 	}
-	serialBytes := template.SerialNumber.Bytes()
-	if len(serialBytes) > 20 || (len(serialBytes) == 20 && serialBytes[0]&0x80 != 0) {
-		return nil, errors.New("x509: serial number exceeds 20 octets")
-	}
 
 	if template.BasicConstraintsValid && !template.IsCA && template.MaxPathLen != -1 && (template.MaxPathLen != 0 || template.MaxPathLenZero) {
 		return nil, errors.New("x509: only CAs are allowed to specify MaxPathLen")
diff --git a/src/crypto/x509/x509_test.go b/src/crypto/x509/x509_test.go
index 486d6bf..f68dd02 100644
--- a/src/crypto/x509/x509_test.go
+++ b/src/crypto/x509/x509_test.go
@@ -3589,42 +3589,6 @@
 	}
 }
 
-func TestCreateCertificateLongSerial(t *testing.T) {
-	k, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	serialBytes := make([]byte, 21)
-	serialBytes[0] = 0x80
-	serialBytes[20] = 1
-	tooLong := big.NewInt(0).SetBytes(serialBytes)
-
-	tmpl := &Certificate{
-		SerialNumber: tooLong,
-		Subject: pkix.Name{
-			CommonName: ":)",
-		},
-		NotAfter:  time.Now().Add(time.Hour),
-		NotBefore: time.Now().Add(-time.Hour),
-	}
-
-	expectedErr := "x509: serial number exceeds 20 octets"
-
-	_, err = CreateCertificate(rand.Reader, tmpl, tmpl, k.Public(), k)
-	if err == nil || err.Error() != expectedErr {
-		t.Errorf("CreateCertificate returned unexpected error: want %q, got %q", expectedErr, err)
-	}
-
-	serialBytes = serialBytes[:20]
-	tmpl.SerialNumber = big.NewInt(0).SetBytes(serialBytes)
-
-	_, err = CreateCertificate(rand.Reader, tmpl, tmpl, k.Public(), k)
-	if err == nil || err.Error() != expectedErr {
-		t.Errorf("CreateCertificate returned unexpected error: want %q, got %q", expectedErr, err)
-	}
-}
-
 var negativeSerialCert = `-----BEGIN CERTIFICATE-----
 MIIBBTCBraADAgECAgH/MAoGCCqGSM49BAMCMA0xCzAJBgNVBAMTAjopMB4XDTIy
 MDQxNDIzNTYwNFoXDTIyMDQxNTAxNTYwNFowDTELMAkGA1UEAxMCOikwWTATBgcq
diff --git a/src/debug/pe/symbol.go b/src/debug/pe/symbol.go
index 0dfd5d9..323fa8c 100644
--- a/src/debug/pe/symbol.go
+++ b/src/debug/pe/symbol.go
@@ -136,10 +136,9 @@
 // auxiliary symbols: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#auxiliary-symbol-records
 // COMDAT sections: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#comdat-sections-object-only
 // auxiliary info for section definitions: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#auxiliary-format-5-section-definitions
-//
 func (f *File) COFFSymbolReadSectionDefAux(idx int) (*COFFSymbolAuxFormat5, error) {
 	var rv *COFFSymbolAuxFormat5
-	if idx < 0 || idx > len(f.COFFSymbols) {
+	if idx < 0 || idx >= len(f.COFFSymbols) {
 		return rv, fmt.Errorf("invalid symbol index")
 	}
 	pesym := &f.COFFSymbols[idx]
diff --git a/src/go/build/build.go b/src/go/build/build.go
index c67d6d0..f40b486 100644
--- a/src/go/build/build.go
+++ b/src/go/build/build.go
@@ -1185,20 +1185,13 @@
 	if ctxt.CgoEnabled {
 		cgo = "1"
 	}
-	cmd.Env = append(os.Environ(),
+	cmd.Env = append(cmd.Environ(),
 		"GOOS="+ctxt.GOOS,
 		"GOARCH="+ctxt.GOARCH,
 		"GOROOT="+ctxt.GOROOT,
 		"GOPATH="+ctxt.GOPATH,
 		"CGO_ENABLED="+cgo,
 	)
-	if cmd.Dir != "" {
-		// If possible, set PWD: if an error occurs and PWD includes a symlink, we
-		// want the error to refer to Dir, not some other name for it.
-		if abs, err := filepath.Abs(cmd.Dir); err == nil {
-			cmd.Env = append(cmd.Env, "PWD="+abs)
-		}
-	}
 
 	if err := cmd.Run(); err != nil {
 		return fmt.Errorf("go/build: go list %s: %v\n%s\n", path, err, stderr.String())
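The replacement relies on exec.Cmd.Environ, which folds in the PWD handling previously done by hand here. A minimal usage sketch (assuming a Unix-like system):

	package main

	import (
		"fmt"
		"os/exec"
		"strings"
	)

	func main() {
		cmd := exec.Command("go", "env")
		cmd.Dir = "/tmp"
		for _, kv := range cmd.Environ() {
			if strings.HasPrefix(kv, "PWD=") {
				fmt.Println(kv) // PWD=/tmp, matching cmd.Dir
			}
		}
	}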
diff --git a/src/go/format/format.go b/src/go/format/format.go
index fb87e84..3837cb4 100644
--- a/src/go/format/format.go
+++ b/src/go/format/format.go
@@ -38,7 +38,7 @@
 
 var config = printer.Config{Mode: printerMode, Tabwidth: tabWidth}
 
-const parserMode = parser.ParseComments
+const parserMode = parser.ParseComments | parser.SkipObjectResolution
 
 // Node formats node in canonical gofmt style and writes the result to dst.
 //
diff --git a/src/go/internal/srcimporter/srcimporter.go b/src/go/internal/srcimporter/srcimporter.go
index d7ec669..ea6f012 100644
--- a/src/go/internal/srcimporter/srcimporter.go
+++ b/src/go/internal/srcimporter/srcimporter.go
@@ -136,7 +136,7 @@
 			setUsesCgo(&conf)
 			file, err := p.cgo(bp)
 			if err != nil {
-				return nil, err
+				return nil, fmt.Errorf("error processing cgo for package %q: %w", bp.ImportPath, err)
 			}
 			files = append(files, file)
 		}
@@ -223,9 +223,9 @@
 	args = append(args, bp.CgoCPPFLAGS...)
 	if len(bp.CgoPkgConfig) > 0 {
 		cmd := exec.Command("pkg-config", append([]string{"--cflags"}, bp.CgoPkgConfig...)...)
-		out, err := cmd.CombinedOutput()
+		out, err := cmd.Output()
 		if err != nil {
-			return nil, err
+			return nil, fmt.Errorf("pkg-config --cflags: %w", err)
 		}
 		args = append(args, strings.Fields(string(out))...)
 	}
@@ -237,7 +237,7 @@
 	cmd := exec.Command(args[0], args[1:]...)
 	cmd.Dir = bp.Dir
 	if err := cmd.Run(); err != nil {
-		return nil, err
+		return nil, fmt.Errorf("go tool cgo: %w", err)
 	}
 
 	return parser.ParseFile(p.fset, filepath.Join(tmpdir, "_cgo_gotypes.go"), nil, 0)
diff --git a/src/go/types/api.go b/src/go/types/api.go
index 4f63d627..0915d6a 100644
--- a/src/go/types/api.go
+++ b/src/go/types/api.go
@@ -414,9 +414,9 @@
 // AssertableTo reports whether a value of type V can be asserted to have type T.
 //
 // The behavior of AssertableTo is undefined in two cases:
-//  - if V is a generalized interface; i.e., an interface that may only be used
-//    as a type constraint in Go code
-//  - if T is an uninstantiated generic type
+//   - if V is a generalized interface; i.e., an interface that may only be used
+//     as a type constraint in Go code
+//   - if T is an uninstantiated generic type
 func AssertableTo(V *Interface, T Type) bool {
 	// Checker.newAssertableTo suppresses errors for invalid types, so we need special
 	// handling here.
diff --git a/src/go/types/assignments.go b/src/go/types/assignments.go
index 101e868..98d7563 100644
--- a/src/go/types/assignments.go
+++ b/src/go/types/assignments.go
@@ -9,6 +9,7 @@
 import (
 	"fmt"
 	"go/ast"
+	"go/token"
 	"strings"
 )
 
@@ -339,11 +340,10 @@
 			} else if len(rhs) > 0 {
 				at = rhs[len(rhs)-1].expr // report at last value
 			}
-			check.errorf(at, _WrongResultCount, "%s return values\n\thave %s\n\twant %s",
-				qualifier,
-				check.typesSummary(operandTypes(rhs), false),
-				check.typesSummary(varTypes(lhs), false),
-			)
+			err := newErrorf(at, _WrongResultCount, "%s return values", qualifier)
+			err.errorf(token.NoPos, "have %s", check.typesSummary(operandTypes(rhs), false))
+			err.errorf(token.NoPos, "want %s", check.typesSummary(varTypes(lhs), false))
+			check.report(err)
 			return
 		}
 		if compilerErrorMessages {
diff --git a/src/go/types/call.go b/src/go/types/call.go
index 5160317..3c7c322 100644
--- a/src/go/types/call.go
+++ b/src/go/types/call.go
@@ -368,11 +368,10 @@
 		if sig.params != nil {
 			params = sig.params.vars
 		}
-		check.errorf(at, _WrongArgCount, "%s arguments in call to %s\n\thave %s\n\twant %s",
-			qualifier, call.Fun,
-			check.typesSummary(operandTypes(args), false),
-			check.typesSummary(varTypes(params), sig.variadic),
-		)
+		err := newErrorf(at, _WrongArgCount, "%s arguments in call to %s", qualifier, call.Fun)
+		err.errorf(token.NoPos, "have %s", check.typesSummary(operandTypes(args), false))
+		err.errorf(token.NoPos, "want %s", check.typesSummary(varTypes(params), sig.variadic))
+		check.report(err)
 		return
 	}
 
diff --git a/src/go/types/conversions.go b/src/go/types/conversions.go
index 65691cf..362c8fd 100644
--- a/src/go/types/conversions.go
+++ b/src/go/types/conversions.go
@@ -8,6 +8,7 @@
 
 import (
 	"go/constant"
+	"go/token"
 	"unicode"
 )
 
@@ -74,7 +75,9 @@
 		if compilerErrorMessages {
 			if cause != "" {
 				// Add colon at end of line if we have a following cause.
-				check.errorf(x, _InvalidConversion, "cannot convert %s to type %s:\n\t%s", x, T, cause)
+				err := newErrorf(x, _InvalidConversion, "cannot convert %s to type %s:", x, T)
+				err.errorf(token.NoPos, cause)
+				check.report(err)
 			} else {
 				check.errorf(x, _InvalidConversion, "cannot convert %s to type %s", x, T)
 			}
diff --git a/src/go/types/errors.go b/src/go/types/errors.go
index fade863..0dc0bc8 100644
--- a/src/go/types/errors.go
+++ b/src/go/types/errors.go
@@ -8,7 +8,6 @@
 
 import (
 	"bytes"
-	"errors"
 	"fmt"
 	"go/ast"
 	"go/token"
@@ -26,6 +25,64 @@
 	panic("unreachable")
 }
 
+// An error_ represents a type-checking error.
+// To report an error_, call Checker.report.
+type error_ struct {
+	desc []errorDesc
+	code errorCode
+	soft bool // TODO(gri) eventually determine this from an error code
+}
+
+// An errorDesc describes part of a type-checking error.
+type errorDesc struct {
+	posn   positioner
+	format string
+	args   []interface{}
+}
+
+func (err *error_) empty() bool {
+	return err.desc == nil
+}
+
+func (err *error_) pos() token.Pos {
+	if err.empty() {
+		return token.NoPos
+	}
+	return err.desc[0].posn.Pos()
+}
+
+func (err *error_) msg(fset *token.FileSet, qf Qualifier) string {
+	if err.empty() {
+		return "no error"
+	}
+	var buf bytes.Buffer
+	for i := range err.desc {
+		p := &err.desc[i]
+		if i > 0 {
+			fmt.Fprint(&buf, "\n\t")
+			if p.posn.Pos().IsValid() {
+				fmt.Fprintf(&buf, "%s: ", fset.Position(p.posn.Pos()))
+			}
+		}
+		buf.WriteString(sprintf(fset, qf, false, p.format, p.args...))
+	}
+	return buf.String()
+}
+
+// String is for testing.
+func (err *error_) String() string {
+	if err.empty() {
+		return "no error"
+	}
+	return fmt.Sprintf("%d: %s", err.pos(), err.msg(nil, nil))
+}
+
+// errorf adds formatted error information to err.
+// It may be called multiple times to provide additional information.
+func (err *error_) errorf(at token.Pos, format string, args ...interface{}) {
+	err.desc = append(err.desc, errorDesc{atPos(at), format, args})
+}
+
 func (check *Checker) qualifier(pkg *Package) string {
 	// Qualify the package unless it's the package being type-checked.
 	if pkg != check.pkg {
@@ -140,36 +197,46 @@
 	fmt.Println(sprintf(check.fset, check.qualifier, true, format, args...))
 }
 
-func (check *Checker) err(err error) {
-	if err == nil {
-		return
+// report records the error pointed to by errp, setting check.firstErr if
+// necessary.
+func (check *Checker) report(errp *error_) {
+	if errp.empty() {
+		panic("empty error details")
 	}
-	var e Error
-	isInternal := errors.As(err, &e)
+
+	span := spanOf(errp.desc[0].posn)
+	e := Error{
+		Fset:       check.fset,
+		Pos:        span.pos,
+		Msg:        errp.msg(check.fset, check.qualifier),
+		Soft:       errp.soft,
+		go116code:  errp.code,
+		go116start: span.start,
+		go116end:   span.end,
+	}
+
 	// Cheap trick: Don't report errors with messages containing
 	// "invalid operand" or "invalid type" as those tend to be
 	// follow-on errors which don't add useful information. Only
 	// exclude them if these strings are not at the beginning,
 	// and only if we have at least one error already reported.
-	isInvalidErr := isInternal && (strings.Index(e.Msg, "invalid operand") > 0 || strings.Index(e.Msg, "invalid type") > 0)
+	isInvalidErr := strings.Index(e.Msg, "invalid operand") > 0 || strings.Index(e.Msg, "invalid type") > 0
 	if check.firstErr != nil && isInvalidErr {
 		return
 	}
 
-	if isInternal {
-		e.Msg = stripAnnotations(e.Msg)
-		if check.errpos != nil {
-			// If we have an internal error and the errpos override is set, use it to
-			// augment our error positioning.
-			// TODO(rFindley) we may also want to augment the error message and refer
-			// to the position (pos) in the original expression.
-			span := spanOf(check.errpos)
-			e.Pos = span.pos
-			e.go116start = span.start
-			e.go116end = span.end
-		}
-		err = e
+	e.Msg = stripAnnotations(e.Msg)
+	if check.errpos != nil {
+		// If we have an internal error and the errpos override is set, use it to
+		// augment our error positioning.
+		// TODO(rFindley) we may also want to augment the error message and refer
+		// to the position (pos) in the original expression.
+		span := spanOf(check.errpos)
+		e.Pos = span.pos
+		e.go116start = span.start
+		e.go116end = span.end
 	}
+	err := e
 
 	if check.firstErr == nil {
 		check.firstErr = err
@@ -178,10 +245,6 @@
 	if trace {
 		pos := e.Pos
 		msg := e.Msg
-		if !isInternal {
-			msg = err.Error()
-			pos = token.NoPos
-		}
 		check.trace(pos, "ERROR: %s", msg)
 	}
 
@@ -192,35 +255,26 @@
 	f(err)
 }
 
-func (check *Checker) newError(at positioner, code errorCode, soft bool, msg string) error {
-	span := spanOf(at)
-	return Error{
-		Fset:       check.fset,
-		Pos:        span.pos,
-		Msg:        msg,
-		Soft:       soft,
-		go116code:  code,
-		go116start: span.start,
-		go116end:   span.end,
+// newErrorf creates a new error_ for later reporting with check.report.
+func newErrorf(at positioner, code errorCode, format string, args ...any) *error_ {
+	return &error_{
+		desc: []errorDesc{{at, format, args}},
+		code: code,
 	}
 }
 
-// newErrorf creates a new Error, but does not handle it.
-func (check *Checker) newErrorf(at positioner, code errorCode, soft bool, format string, args ...any) error {
-	msg := check.sprintf(format, args...)
-	return check.newError(at, code, soft, msg)
-}
-
 func (check *Checker) error(at positioner, code errorCode, msg string) {
-	check.err(check.newError(at, code, false, msg))
+	check.report(newErrorf(at, code, msg))
 }
 
 func (check *Checker) errorf(at positioner, code errorCode, format string, args ...any) {
-	check.error(at, code, check.sprintf(format, args...))
+	check.report(newErrorf(at, code, format, args...))
 }
 
 func (check *Checker) softErrorf(at positioner, code errorCode, format string, args ...any) {
-	check.err(check.newErrorf(at, code, true, format, args...))
+	err := newErrorf(at, code, format, args...)
+	err.soft = true
+	check.report(err)
 }
 
 func (check *Checker) invalidAST(at positioner, format string, args ...any) {
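
The user-visible effect of the error_ accumulator is that multi-part messages are joined with "\n\t". A self-contained sketch of the new output shape, exercising the have/want path rewritten in assignments.go above (the printed message is approximate):

package main

import (
	"fmt"
	"go/ast"
	"go/parser"
	"go/token"
	"go/types"
)

const src = `package p
func f() int { return 1, true }
`

func main() {
	fset := token.NewFileSet()
	file, err := parser.ParseFile(fset, "p.go", src, 0)
	if err != nil {
		panic(err)
	}
	var conf types.Config
	_, err = conf.Check("p", fset, []*ast.File{file}, nil)
	fmt.Println(err)
	// Prints roughly:
	// p.go:2:26: too many return values
	//	have (number, bool)
	//	want (int)
}
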
diff --git a/src/go/types/errors_test.go b/src/go/types/errors_test.go
index 942a9fd..4b5dab6 100644
--- a/src/go/types/errors_test.go
+++ b/src/go/types/errors_test.go
@@ -4,7 +4,30 @@
 
 package types
 
-import "testing"
+import (
+	"go/token"
+	"testing"
+)
+
+func TestError(t *testing.T) {
+	var err error_
+	want := "no error"
+	if got := err.String(); got != want {
+		t.Errorf("empty error: got %q, want %q", got, want)
+	}
+
+	want = "0: foo 42"
+	err.errorf(token.NoPos, "foo %d", 42)
+	if got := err.String(); got != want {
+		t.Errorf("simple error: got %q, want %q", got, want)
+	}
+
+	want = "0: foo 42\n\tbar 43"
+	err.errorf(token.NoPos, "bar %d", 43)
+	if got := err.String(); got != want {
+		t.Errorf("simple error: got %q, want %q", got, want)
+	}
+}
 
 func TestStripAnnotations(t *testing.T) {
 	for _, test := range []struct {
diff --git a/src/go/types/expr.go b/src/go/types/expr.go
index 9771535..70914d5 100644
--- a/src/go/types/expr.go
+++ b/src/go/types/expr.go
@@ -87,7 +87,7 @@
 // overflow checks that the constant x is representable by its type.
 // For untyped constants, it checks that the value doesn't become
 // arbitrarily large.
-func (check *Checker) overflow(x *operand, op token.Token, opPos token.Pos) {
+func (check *Checker) overflow(x *operand, opPos token.Pos) {
 	assert(x.mode == constant_)
 
 	if x.val.Kind() == constant.Unknown {
@@ -115,8 +115,8 @@
 	}
 }
 
-// opName returns the name of an operation, or the empty string.
-// Only operations that might overflow are handled.
+// opName returns the name of the operation if x is an operation
+// that might overflow; otherwise it returns the empty string.
 func opName(e ast.Expr) string {
 	switch e := e.(type) {
 	case *ast.BinaryExpr:
@@ -213,7 +213,7 @@
 		}
 		x.val = constant.UnaryOp(e.Op, x.val, prec)
 		x.expr = e
-		check.overflow(x, e.Op, x.Pos())
+		check.overflow(x, x.Pos())
 		return
 	}
 
@@ -991,7 +991,7 @@
 			if b, _ := e.(*ast.BinaryExpr); b != nil {
 				opPos = b.OpPos
 			}
-			check.overflow(x, op, opPos)
+			check.overflow(x, opPos)
 			return
 		}
 
@@ -1171,7 +1171,7 @@
 		}
 		x.val = constant.BinaryOp(x.val, op, y.val)
 		x.expr = e
-		check.overflow(x, op, opPos)
+		check.overflow(x, opPos)
 		return
 	}
 
diff --git a/src/go/types/sizes.go b/src/go/types/sizes.go
index 494e045..7b67dca 100644
--- a/src/go/types/sizes.go
+++ b/src/go/types/sizes.go
@@ -166,10 +166,11 @@
 // common architecture word sizes and alignments
 var gcArchSizes = map[string]*StdSizes{
 	"386":      {4, 4},
-	"arm":      {4, 4},
-	"arm64":    {8, 8},
 	"amd64":    {8, 8},
 	"amd64p32": {4, 8},
+	"arm":      {4, 4},
+	"arm64":    {8, 8},
+	"loong64":  {8, 8},
 	"mips":     {4, 4},
 	"mipsle":   {4, 4},
 	"mips64":   {8, 8},
@@ -188,7 +189,7 @@
 // The result is nil if a compiler/architecture pair is not known.
 //
 // Supported architectures for compiler "gc":
-// "386", "arm", "arm64", "amd64", "amd64p32", "mips", "mipsle",
+// "386", "amd64", "amd64p32", "arm", "arm64", "loong64", "mips", "mipsle",
 // "mips64", "mips64le", "ppc64", "ppc64le", "riscv64", "s390x", "sparc64", "wasm".
 func SizesFor(compiler, arch string) Sizes {
 	var m map[string]*StdSizes
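
The table is now sorted and gains a loong64 entry; a runnable check against the public API (the printed values follow from StdSizes{8, 8}):

package main

import (
	"fmt"
	"go/types"
)

func main() {
	sizes := types.SizesFor("gc", "loong64")
	if sizes == nil {
		panic("loong64 not known to go/types")
	}
	fmt.Println(sizes.Sizeof(types.Typ[types.Int]))    // 8: 64-bit word size
	fmt.Println(sizes.Alignof(types.Typ[types.Int64])) // 8: maximum alignment
}
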
diff --git a/src/go/types/testdata/check/const0.go b/src/go/types/testdata/check/const0.go
index 3cffdf9..229c2486 100644
--- a/src/go/types/testdata/check/const0.go
+++ b/src/go/types/testdata/check/const0.go
@@ -349,6 +349,25 @@
 	assert(iota == 0)
 })
 
+// issue #52438
+const i1 = iota
+const i2 = iota
+const i3 = iota
+
+func _() {
+	assert(i1 == 0)
+	assert(i2 == 0)
+	assert(i3 == 0)
+
+	const i4 = iota
+	const i5 = iota
+	const i6 = iota
+
+	assert(i4 == 0)
+	assert(i5 == 0)
+	assert(i6 == 0)
+}
+
 // untyped constants must not get arbitrarily large
 const prec = 512 // internal maximum precision for integers
 const maxInt = (1<<(prec/2) - 1) * (1<<(prec/2) + 1) // == 1<<prec - 1
diff --git a/src/go/types/testdata/fixedbugs/issue52401.go b/src/go/types/testdata/fixedbugs/issue52401.go
new file mode 100644
index 0000000..c7efd8c
--- /dev/null
+++ b/src/go/types/testdata/fixedbugs/issue52401.go
@@ -0,0 +1,11 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package p
+
+func _() {
+	const x = 0
+	x /* ERROR cannot assign to x */ += 1
+	x /* ERROR cannot assign to x */ ++
+}
diff --git a/src/hash/maphash/maphash.go b/src/hash/maphash/maphash.go
index 783690e..dfacd02 100644
--- a/src/hash/maphash/maphash.go
+++ b/src/hash/maphash/maphash.go
@@ -252,21 +252,20 @@
 
 // MakeSeed returns a new random seed.
 func MakeSeed() Seed {
-	var s1, s2 uint64
+	var s uint64
 	for {
-		s1 = uint64(runtime_fastrand())
-		s2 = uint64(runtime_fastrand())
+		s = runtime_fastrand64()
 		// We use seed 0 to indicate an uninitialized seed/hash,
 		// so keep trying until we get a non-zero seed.
-		if s1|s2 != 0 {
+		if s != 0 {
 			break
 		}
 	}
-	return Seed{s: s1<<32 + s2}
+	return Seed{s: s}
 }
 
-//go:linkname runtime_fastrand runtime.fastrand
-func runtime_fastrand() uint32
+//go:linkname runtime_fastrand64 runtime.fastrand64
+func runtime_fastrand64() uint64
 
 func rthash(ptr *byte, len int, seed uint64) uint64 {
 	if len == 0 {
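
Callers see no API change from the switch to a single 64-bit fastrand call; a minimal sketch using only the public maphash surface:

package main

import (
	"fmt"
	"hash/maphash"
)

func main() {
	seed := maphash.MakeSeed() // internally now one fastrand64 call, retried only if zero
	var h maphash.Hash
	h.SetSeed(seed)
	h.WriteString("hello")
	fmt.Println(h.Sum64()) // varies per process: the seed is random
}
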
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
index fc6f170c..cbe0525 100644
--- a/src/internal/bytealg/compare_ppc64x.s
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -21,11 +21,12 @@
 	CMP     R5,R6,CR7
 	CMP	R3,R4,CR6
 	BEQ	CR7,equal
-#ifdef	GOARCH_ppc64le
-	BR	cmpbodyLE<>(SB)
-#else
-	BR      cmpbodyBE<>(SB)
-#endif
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
 equal:
 	BEQ	CR6,done
 	MOVD	$1, R8
@@ -52,11 +53,12 @@
 	CMP     R5,R6,CR7
 	CMP	R3,R4,CR6
 	BEQ	CR7,equal
-#ifdef	GOARCH_ppc64le
-	BR	cmpbodyLE<>(SB)
-#else
-	BR      cmpbodyBE<>(SB)
-#endif
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
 equal:
 	BEQ	CR6,done
 	MOVD	$1, R8
@@ -70,209 +72,431 @@
 	MOVD	$0, R3
 	RET
 
-// Do an efficient memcmp for ppc64le
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+GLOBL byteswap<>+0(SB), RODATA, $16
+#define SWAP V21
+#endif
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER8
 // R3 = a len
 // R4 = b len
 // R5 = a addr
 // R6 = b addr
 // On exit:
 // R3 = return value
-TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	R3,R8		// set up length
 	CMP	R3,R4,CR2	// unequal?
-	BC	12,8,setuplen	// BLT CR2
+	BLT	CR2,setuplen	// branch if len(a) < len(b)
 	MOVD	R4,R8		// use R4 for comparison len
 setuplen:
-	MOVD	R8,CTR		// set up loop counter
-	CMP	R8,$8		// only optimize >=8
-	BLT	simplecheck
-	DCBT	(R5)		// cache hint
-	DCBT	(R6)
 	CMP	R8,$32		// optimize >= 32
 	MOVD	R8,R9
-	BLT	setup8a		// 8 byte moves only
-setup32a:
-	SRADCC	$5,R8,R9	// number of 32 byte chunks
-	MOVD	R9,CTR
+	BLT	setup8a		// optimize < 32
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$64
+	BLT	cmp32		// process size 32-63
 
-	// Special processing for 32 bytes or longer.
-	// Loading this way is faster and correct as long as the
-	// doublewords being compared are equal. Once they
-	// are found unequal, reload them in proper byte order
-	// to determine greater or less than.
-loop32a:
-	MOVD	0(R5),R9	// doublewords to compare
-	MOVD	0(R6),R10	// get 4 doublewords
-	MOVD	8(R5),R14
-	MOVD	8(R6),R15
-	CMPU	R9,R10		// bytes equal?
-	MOVD	$0,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	16(R5),R9	// get next pair of doublewords
-	MOVD	16(R6),R10
-	CMPU	R14,R15		// bytes match?
-	MOVD	$8,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	24(R5),R14	// get next pair of doublewords
-	MOVD    24(R6),R15
-	CMPU	R9,R10		// bytes match?
-	MOVD	$16,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	$-8,R16		// for cmpne, R5,R6 already inc by 32
-	ADD	$32,R5		// bump up to next 32
-	ADD	$32,R6
-	CMPU    R14,R15		// bytes match?
-	BC	8,2,loop32a	// br ctr and cr
-	BNE	cmpne
+	DCBT	(R5)		// optimize >= 64
+	DCBT	(R6)		// cache hint
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:	// process size 64 and greater
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different	// jump out if it's different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
+	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
+	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+
+	CMPU	R9,$32
+	BGE	cmp32		// loop to cmp32 if there are 32-64 bytes remaining
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+rem:
+	MOVD	R9,R8
 	ANDCC	$24,R8,R9	// Any 8 byte chunks?
 	BEQ	leftover	// and result is 0
+	BR	setup8a
+
+different:
+#ifdef	GOARCH_ppc64le
+	MOVD	$byteswap<>+00(SB), R16
+	LXVD2X	(R16)(R0),SWAP	// Set up swap string
+
+	VPERM	V3,V3,SWAP,V3
+	VPERM	V4,V4,SWAP,V4
+#endif
+	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
+	MFVSRD	VS35,R16
+	VSLDOI	$8,V4,V4,V4
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
 setup8a:
-	SRADCC	$3,R9,R9	// get the 8 byte count
+	SRADCC	$3,R8,R9	// get the 8 byte count
 	BEQ	leftover	// shifted value is 0
+	CMPU	R8,$8		// optimize 8byte move
+	BEQ	size8
+	CMPU	R8,$16
+	BEQ	size16
 	MOVD	R9,CTR		// loop count for doublewords
 loop8:
-	MOVDBR	(R5+R0),R9	// doublewords to compare
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
 	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
 	ADD	$8,R5
 	ADD	$8,R6
-	CMPU	R9,R10		// match?
+	CMPU	R16,R10		// match?
 	BC	8,2,loop8	// bt ctr <> 0 && cr
 	BGT	greater
 	BLT	less
 leftover:
 	ANDCC	$7,R8,R9	// check for leftover bytes
-	MOVD	R9,CTR		// save the ctr
-	BNE	simple		// leftover bytes
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less
-	BR	greater
+	BEQ	zeroremainder
 simplecheck:
-	CMP	R8,$0		// remaining compare length 0
-	BNE	simple		// do simple compare
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less	// 1st len < 2nd len, result less
-	BR	greater		// 1st len > 2nd len must be greater
-simple:
-	MOVBZ	0(R5), R9	// get byte from 1st operand
-	ADD	$1,R5
-	MOVBZ	0(R6), R10	// get byte from 2nd operand
-	ADD	$1,R6
-	CMPU	R9, R10
-	BC	8,2,simple	// bc ctr <> 0 && cr
-	BGT	greater		// 1st > 2nd
-	BLT	less		// 1st < 2nd
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,9,greater	// 2nd len > 1st len
-	BR	less		// must be less
-cmpne:				// only here is not equal
-	MOVDBR	(R5+R16),R8	// reload in reverse order
-	MOVDBR	(R6+R16),R9
-	CMPU	R8,R9		// compare correct endianness
-	BGT	greater		// here only if NE
-less:
-	MOVD	$-1, R3		// return value if A < B
+	MOVD	R0,R14
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef  GOARCH_ppc64le
+	MOVWBR	(R5)(R14),R10
+	MOVWBR	(R6)(R14),R11
+#else
+	MOVWZ	(R5)(R14),R10
+	MOVWZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-4,R9
+	ADD	$4,R14
+	PCALIGN	$16
+
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef  GOARCH_ppc64le
+	MOVHBR	(R5)(R14),R10
+	MOVHBR	(R6)(R14),R11
+#else
+	MOVHZ	(R5)(R14),R10
+	MOVHZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-2,R9
+	ADD	$2,R14
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5)(R14),R10
+	MOVBZ	(R6)(R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
+
+less:	MOVD	$-1,R3		// return value if A < B
 	RET
+size16:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+zeroremainder:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+size8:
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
+	CMPU	R16,R10		// match?
+	BGT	greater
+	BLT	less
+	BGT	CR2,greater	// 1st len > 2nd len
+	BLT	CR2,less	// 1st len < 2nd len
 equal:
 	MOVD	$0, R3		// return value if A == B
 	RET
 greater:
-	MOVD	$1, R3		// return value if A > B
+	MOVD	$1,R3		// return value if A > B
 	RET
 
-// Do an efficient memcmp for ppc64 (BE)
+// Do an efficient memcmp for ppc64le/ppc64/POWER9
 // R3 = a len
 // R4 = b len
 // R5 = a addr
 // R6 = b addr
 // On exit:
 // R3 = return value
-TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	R3,R8		// set up length
 	CMP	R3,R4,CR2	// unequal?
-	BC	12,8,setuplen	// BLT CR2
+	BLT	CR2,setuplen	// branch if len(a) < len(b)
 	MOVD	R4,R8		// use R4 for comparison len
 setuplen:
-	MOVD	R8,CTR		// set up loop counter
-	CMP	R8,$8		// only optimize >=8
-	BLT	simplecheck
-	DCBT	(R5)		// cache hint
-	DCBT	(R6)
-	CMP	R8,$32		// optimize >= 32
+	CMP	R8,$16		// optimize for size<16
 	MOVD	R8,R9
-	BLT	setup8a		// 8 byte moves only
+	BLT	simplecheck
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$32		// optimize for size 16-31
+	BLT	cmp16
+	CMP	R8,$64
+	BLT	cmp32		// optimize for size 32-63
+	DCBT	(R5)		// optimize for size>=64
+	DCBT	(R6)		// cache hint
 
-setup32a:
-	SRADCC	$5,R8,R9	// number of 32 byte chunks
-	MOVD	R9,CTR
-loop32a:
-	MOVD	0(R5),R9	// doublewords to compare
-	MOVD	0(R6),R10	// get 4 doublewords
-	MOVD	8(R5),R14
-	MOVD	8(R6),R15
-	CMPU	R9,R10		// bytes equal?
-	BLT	less		// found to be less
-	BGT	greater		// found to be greater
-	MOVD	16(R5),R9	// get next pair of doublewords
-	MOVD	16(R6),R10
-	CMPU	R14,R15		// bytes match?
-	BLT	less		// found less
-	BGT	greater		// found greater
-	MOVD	24(R5),R14	// get next pair of doublewords
-	MOVD	24(R6),R15
-	CMPU	R9,R10		// bytes match?
-	BLT	less		// found to be less
-	BGT	greater		// found to be greater
-	ADD	$32,R5		// bump up to next 32
-	ADD	$32,R6
-	CMPU	R14,R15		// bytes match?
-	BC	8,2,loop32a	// br ctr and cr
-	BLT	less		// with BE, byte ordering is
-	BGT	greater		// good for compare
-	ANDCC	$24,R8,R9	// Any 8 byte chunks?
-	BEQ	leftover	// and result is 0
-setup8a:
-	SRADCC	$3,R9,R9	// get the 8 byte count
-	BEQ	leftover	// shifted value is 0
-	MOVD	R9,CTR		// loop count for doublewords
-loop8:
-	MOVD	(R5),R9
-	MOVD	(R6),R10
-	ADD	$8,R5
-	ADD	$8,R6
-	CMPU	R9,R10		// match?
-	BC	8,2,loop8	// bt ctr <> 0 && cr
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:	// process size 64 and greater
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if it's different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R11)(R5),V3	// load bytes of A at offset 32 into vector
+	LXVB16X	(R11)(R6),V4	// load bytes of B at offset 32 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R12)(R5),V3	// load bytes of A at offset 48 into vector
+	LXVB16X	(R12)(R6),V4	// load bytes of B at offset 48 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+
+	CMPU	R9,$32
+	BGE	cmp32		// loop to cmp32 if there are 32-64 bytes remaining
+	CMPU	R9,$16
+	BGE	cmp16		// loop to cmp16 if there are 16-31 bytes left
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remaining bytes
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if it's different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$16		// loop to cmp16 if there are 16-31 bytes left
+	BGE	cmp16
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remainder bytes
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+different:
+
+	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	MFVSRLD	VS35,R16	// move lower doublewords of A and B into GPR for comparison
+	MFVSRLD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+
+greater:
+	MOVD	$1,R3		// return value if A > B
+	RET
+cmp16:
+	ANDCC	$16,R9,R31
+	BEQ	tail
+
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$16,R5
+	ADD	$16,R6
+tail:
+	ANDCC	$15,R9		// Load the last 16 bytes (we know there are at least 32b)
+	BEQ	end
+
+	ADD	R9,R5
+	ADD	R9,R6
+	MOVD	$-16,R10
+
+	LXVB16X	(R10)(R5),V3	// load last 16 bytes of A into vector
+	LXVB16X	(R10)(R6),V4	// load last 16 bytes of B into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+end:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+simplecheck:
+	MOVD	$0,R14		// process 8 bytes
+	CMP	R9,$8
+	BLT	word
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R14),R10
+	MOVDBR	(R6+R14),R11
+#else
+	MOVD	(R5+R14),R10
+	MOVD	(R6+R14),R11
+#endif
+	CMPU	R10,R11
 	BGT	greater
 	BLT	less
-leftover:
-	ANDCC	$7,R8,R9	// check for leftover bytes
-	MOVD	R9,CTR		// save the ctr
-	BNE	simple		// leftover bytes
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less
-	BR	greater
-simplecheck:
-	CMP	R8,$0		// remaining compare length 0
-	BNE	simple		// do simple compare
-	BC	12,10,equal	// test CR2 for length comparison
-	BC 	12,8,less	// 1st len < 2nd len, result less
-	BR	greater		// same len, must be equal
-simple:
-	MOVBZ	0(R5),R9	// get byte from 1st operand
-	ADD	$1,R5
-	MOVBZ	0(R6),R10	// get byte from 2nd operand
-	ADD	$1,R6
-	CMPU	R9,R10
-	BC	8,2,simple	// bc ctr <> 0 && cr
-	BGT	greater		// 1st > 2nd
-	BLT	less		// 1st < 2nd
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,9,greater	// 2nd len > 1st len
+	ADD	$8,R14
+	ADD	$-8,R9
+	PCALIGN	$16
+word:
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef  GOARCH_ppc64le
+	MOVWBR	(R5+R14),R10
+	MOVWBR	(R6+R14),R11
+#else
+	MOVWZ	(R5+R14),R10
+	MOVWZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$4,R14
+	ADD	$-4,R9
+	PCALIGN	$16
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef  GOARCH_ppc64le
+	MOVHBR	(R5+R14),R10
+	MOVHBR	(R6+R14),R11
+#else
+	MOVHZ	(R5+R14),R10
+	MOVHZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$2,R14
+	ADD	$-2,R9
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5+R14),R10
+	MOVBZ	(R6+R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
 less:
-	MOVD	$-1, R3		// return value if A < B
+	MOVD	$-1,R3		// return value if A < B
 	RET
 equal:
 	MOVD	$0, R3		// return value if A == B
 	RET
-greater:
-	MOVD	$1, R3		// return value if A > B
-	RET
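
For reference, the contract both cmpbody variants implement is the familiar -1/0/1 ordering of bytes.Compare; a tiny runnable reminder, independent of the POWER8/POWER9 paths:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	fmt.Println(bytes.Compare([]byte("abc"), []byte("abd")))  // -1: A < B
	fmt.Println(bytes.Compare([]byte("abc"), []byte("abc")))  // 0: A == B
	fmt.Println(bytes.Compare([]byte("abcd"), []byte("abc"))) // 1: prefix equal, len(A) > len(B)
}
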
diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s
index 0dc6251..7d2f8d6 100644
--- a/src/internal/bytealg/compare_riscv64.s
+++ b/src/internal/bytealg/compare_riscv64.s
@@ -5,161 +5,179 @@
 #include "go_asm.h"
 #include "textflag.h"
 
-TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
-	MOV	a_base+0(FP), X5
-	MOV	a_len+8(FP), X6
-	MOV	b_base+24(FP), X7
-	MOV	b_len+32(FP), X8
-	MOV	$ret+48(FP), X9
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	a_base+0(FP), X10
+	MOV	a_len+8(FP), X11
+	MOV	b_base+24(FP), X12
+	MOV	b_len+32(FP), X13
+	MOV	$ret+48(FP), X14
+#else
+	// X10 = a_base
+	// X11 = a_len
+	// X12 = a_cap (unused)
+	// X13 = b_base (want in X12)
+	// X14 = b_len (want in X13)
+	// X15 = b_cap (unused)
+	MOV	X13, X12
+	MOV	X14, X13
+#endif
 	JMP	compare<>(SB)
 
-TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
-	MOV	a_base+0(FP), X5
-	MOV	a_len+8(FP), X6
-	MOV	b_base+16(FP), X7
-	MOV	b_len+24(FP), X8
-	MOV	$ret+32(FP), X9
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	a_base+0(FP), X10
+	MOV	a_len+8(FP), X11
+	MOV	b_base+16(FP), X12
+	MOV	b_len+24(FP), X13
+	MOV	$ret+32(FP), X14
+#endif
+	// X10 = a_base
+	// X11 = a_len
+	// X12 = b_base
+	// X13 = b_len
 	JMP	compare<>(SB)
 
 // On entry:
-// X5 points to start of a
-// X6 length of a
-// X7 points to start of b
-// X8 length of b
-// X9 points to the address to store the return value (-1/0/1)
+// X10 points to start of a
+// X11 length of a
+// X12 points to start of b
+// X13 length of b
+// for non-regabi, X14 points to the address to store the return value (-1/0/1)
+// for regabi, the return value is stored in X10
 TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
-	BEQ	X5, X7, cmp_len
+	BEQ	X10, X12, cmp_len
 
-	MOV	X6, X10
-	BGE	X8, X10, use_a_len // X10 = min(len(a), len(b))
-	MOV	X8, X10
+	MOV	X11, X5
+	BGE	X13, X5, use_a_len // X5 = min(len(a), len(b))
+	MOV	X13, X5
 use_a_len:
-	BEQZ	X10, cmp_len
+	BEQZ	X5, cmp_len
 
-	MOV	$32, X11
-	BLT	X10, X11, loop4_check
+	MOV	$32, X6
+	BLT	X5, X6, loop4_check
 
 	// Check alignment - if alignment differs we have to do one byte at a time.
-	AND	$3, X5, X12
-	AND	$3, X7, X13
-	BNE	X12, X13, loop4_check
-	BEQZ	X12, loop32_check
+	AND	$3, X10, X7
+	AND	$3, X12, X8
+	BNE	X7, X8, loop4_check
+	BEQZ	X7, loop32_check
 
 	// Check one byte at a time until we reach 8 byte alignment.
-	SUB	X12, X10, X10
+	SUB	X7, X5, X5
 align:
-	ADD	$-1, X12
-	MOVBU	0(X5), X13
-	MOVBU	0(X7), X14
-	BNE	X13, X14, cmp
-	ADD	$1, X5
-	ADD	$1, X7
-	BNEZ	X12, align
+	ADD	$-1, X7
+	MOVBU	0(X10), X8
+	MOVBU	0(X12), X9
+	BNE	X8, X9, cmp
+	ADD	$1, X10
+	ADD	$1, X12
+	BNEZ	X7, align
 
 loop32_check:
-	MOV	$32, X12
-	BLT	X10, X12, loop16_check
+	MOV	$32, X7
+	BLT	X5, X7, loop16_check
 loop32:
-	MOV	0(X5), X15
-	MOV	0(X7), X16
-	MOV	8(X5), X17
-	MOV	8(X7), X18
+	MOV	0(X10), X15
+	MOV	0(X12), X16
+	MOV	8(X10), X17
+	MOV	8(X12), X18
 	BEQ	X15, X16, loop32a
 	JMP	cmp8a
 loop32a:
 	BEQ	X17, X18, loop32b
 	JMP	cmp8b
 loop32b:
-	MOV	16(X5), X15
-	MOV	16(X7), X16
-	MOV	24(X5), X17
-	MOV	24(X7), X18
+	MOV	16(X10), X15
+	MOV	16(X12), X16
+	MOV	24(X10), X17
+	MOV	24(X12), X18
 	BEQ	X15, X16, loop32c
 	JMP	cmp8a
 loop32c:
 	BEQ	X17, X18, loop32d
 	JMP	cmp8b
 loop32d:
-	ADD	$32, X5
-	ADD	$32, X7
-	ADD	$-32, X10
-	BGE	X10, X12, loop32
-	BEQZ	X10, cmp_len
+	ADD	$32, X10
+	ADD	$32, X12
+	ADD	$-32, X5
+	BGE	X5, X7, loop32
+	BEQZ	X5, cmp_len
 
 loop16_check:
-	MOV	$16, X11
-	BLT	X10, X11, loop4_check
+	MOV	$16, X6
+	BLT	X5, X6, loop4_check
 loop16:
-	MOV	0(X5), X15
-	MOV	0(X7), X16
-	MOV	8(X5), X17
-	MOV	8(X7), X18
+	MOV	0(X10), X15
+	MOV	0(X12), X16
+	MOV	8(X10), X17
+	MOV	8(X12), X18
 	BEQ	X15, X16, loop16a
 	JMP	cmp8a
 loop16a:
 	BEQ	X17, X18, loop16b
 	JMP	cmp8b
 loop16b:
-	ADD	$16, X5
-	ADD	$16, X7
-	ADD	$-16, X10
-	BGE	X10, X11, loop16
-	BEQZ	X10, cmp_len
+	ADD	$16, X10
+	ADD	$16, X12
+	ADD	$-16, X5
+	BGE	X5, X6, loop16
+	BEQZ	X5, cmp_len
 
 loop4_check:
-	MOV	$4, X11
-	BLT	X10, X11, loop1
+	MOV	$4, X6
+	BLT	X5, X6, loop1
 loop4:
-	MOVBU	0(X5), X13
-	MOVBU	0(X7), X14
-	MOVBU	1(X5), X15
-	MOVBU	1(X7), X16
-	BEQ	X13, X14, loop4a
-	SLTU	X14, X13, X10
-	SLTU	X13, X14, X11
+	MOVBU	0(X10), X8
+	MOVBU	0(X12), X9
+	MOVBU	1(X10), X15
+	MOVBU	1(X12), X16
+	BEQ	X8, X9, loop4a
+	SLTU	X9, X8, X5
+	SLTU	X8, X9, X6
 	JMP	cmp_ret
 loop4a:
 	BEQ	X15, X16, loop4b
-	SLTU	X16, X15, X10
-	SLTU	X15, X16, X11
+	SLTU	X16, X15, X5
+	SLTU	X15, X16, X6
 	JMP	cmp_ret
 loop4b:
-	MOVBU	2(X5), X21
-	MOVBU	2(X7), X22
-	MOVBU	3(X5), X23
-	MOVBU	3(X7), X24
+	MOVBU	2(X10), X21
+	MOVBU	2(X12), X22
+	MOVBU	3(X10), X23
+	MOVBU	3(X12), X24
 	BEQ	X21, X22, loop4c
-	SLTU	X22, X21, X10
-	SLTU	X21, X22, X11
+	SLTU	X22, X21, X5
+	SLTU	X21, X22, X6
 	JMP	cmp_ret
 loop4c:
 	BEQ	X23, X24, loop4d
-	SLTU	X24, X23, X10
-	SLTU	X23, X24, X11
+	SLTU	X24, X23, X5
+	SLTU	X23, X24, X6
 	JMP	cmp_ret
 loop4d:
-	ADD	$4, X5
-	ADD	$4, X7
-	ADD	$-4, X10
-	BGE	X10, X11, loop4
+	ADD	$4, X10
+	ADD	$4, X12
+	ADD	$-4, X5
+	BGE	X5, X6, loop4
 
 loop1:
-	BEQZ	X10, cmp_len
-	MOVBU	0(X5), X13
-	MOVBU	0(X7), X14
-	BNE	X13, X14, cmp
-	ADD	$1, X5
-	ADD	$1, X7
-	ADD	$-1, X10
+	BEQZ	X5, cmp_len
+	MOVBU	0(X10), X8
+	MOVBU	0(X12), X9
+	BNE	X8, X9, cmp
+	ADD	$1, X10
+	ADD	$1, X12
+	ADD	$-1, X5
 	JMP	loop1
 
 	// Compare 8 bytes of memory in X15/X16 that are known to differ.
 cmp8a:
 	MOV	$0xff, X19
 cmp8a_loop:
-	AND	X15, X19, X13
-	AND	X16, X19, X14
-	BNE	X13, X14, cmp
+	AND	X15, X19, X8
+	AND	X16, X19, X9
+	BNE	X8, X9, cmp
 	SLLI	$8, X19
 	JMP	cmp8a_loop
 
@@ -167,19 +185,21 @@
 cmp8b:
 	MOV	$0xff, X19
 cmp8b_loop:
-	AND	X17, X19, X13
-	AND	X18, X19, X14
-	BNE	X13, X14, cmp
+	AND	X17, X19, X8
+	AND	X18, X19, X9
+	BNE	X8, X9, cmp
 	SLLI	$8, X19
 	JMP	cmp8b_loop
 
 cmp_len:
-	MOV	X6, X13
-	MOV	X8, X14
+	MOV	X11, X8
+	MOV	X13, X9
 cmp:
-	SLTU	X14, X13, X10
-	SLTU	X13, X14, X11
+	SLTU	X9, X8, X5
+	SLTU	X8, X9, X6
 cmp_ret:
-	SUB	X10, X11, X12
-	MOV	X12, (X9)
+	SUB	X5, X6, X10
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	X10, (X14)
+#endif
 	RET
diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s
index 3f4eb23..1e081e2 100644
--- a/src/internal/bytealg/count_riscv64.s
+++ b/src/internal/bytealg/count_riscv64.s
@@ -5,40 +5,61 @@
 #include "go_asm.h"
 #include "textflag.h"
 
-TEXT ·Count(SB),NOSPLIT,$0-40
-	MOV	b_base+0(FP), A1
-	MOV	b_len+8(FP), A2
-	MOVBU	c+24(FP), A3	// byte to count
-	MOV	ZERO, A4	// count
-	ADD	A1, A2		// end
+TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	b_base+0(FP), X10
+	MOV	b_len+8(FP), X11
+	MOVBU	c+24(FP), X12	// byte to count
+#else
+	// X10 = b_base
+	// X11 = b_len
+	// X12 = b_cap (unused)
+	// X13 = byte to count (want in X12)
+	MOV	X13, X12
+#endif
+	MOV	ZERO, X14	// count
+	ADD	X10, X11	// end
 
 loop:
-	BEQ	A1, A2, done
-	MOVBU	(A1), A5
-	ADD	$1, A1
-	BNE	A3, A5, loop
-	ADD	$1, A4
+	BEQ	X10, X11, done
+	MOVBU	(X10), X15
+	ADD	$1, X10
+	BNE	X12, X15, loop
+	ADD	$1, X14
 	JMP	loop
 
 done:
-	MOV	A4, ret+32(FP)
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	X14, ret+32(FP)
+#else
+	MOV	X14, X10
+#endif
 	RET
 
-TEXT ·CountString(SB),NOSPLIT,$0-32
-	MOV	s_base+0(FP), A1
-	MOV	s_len+8(FP), A2
-	MOVBU	c+16(FP), A3	// byte to count
-	MOV	ZERO, A4	// count
-	ADD	A1, A2		// end
+TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	s_base+0(FP), X10
+	MOV	s_len+8(FP), X11
+	MOVBU	c+16(FP), X12	// byte to count
+#endif
+	// X10 = s_base
+	// X11 = s_len
+	// X12 = byte to count
+	MOV	ZERO, X14	// count
+	ADD	X10, X11	// end
 
 loop:
-	BEQ	A1, A2, done
-	MOVBU	(A1), A5
-	ADD	$1, A1
-	BNE	A3, A5, loop
-	ADD	$1, A4
+	BEQ	X10, X11, done
+	MOVBU	(X10), X15
+	ADD	$1, X10
+	BNE	X12, X15, loop
+	ADD	$1, X14
 	JMP	loop
 
 done:
-	MOV	A4, ret+24(FP)
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	X14, ret+24(FP)
+#else
+	MOV	X14, X10
+#endif
 	RET
diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s
index 5dd13be..77202d6 100644
--- a/src/internal/bytealg/equal_riscv64.s
+++ b/src/internal/bytealg/equal_riscv64.s
@@ -8,120 +8,137 @@
 #define	CTXT	S10
 
 // func memequal(a, b unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
-	MOV	a+0(FP), X5
-	MOV	b+8(FP), X6
-	MOV	size+16(FP), X7
-	MOV	$ret+24(FP), X19
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	a+0(FP), X10
+	MOV	b+8(FP), X11
+	MOV	size+16(FP), X12
+	MOV	$ret+24(FP), X13
+#endif
+	// X10 = a_base
+	// X11 = b_base
+	// X12 = size
 	JMP	memequal<>(SB)
 
 // func memequal_varlen(a, b unsafe.Pointer) bool
-TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
-	MOV	a+0(FP), X5
-	MOV	b+8(FP), X6
-	MOV	8(CTXT), X7    // compiler stores size at offset 8 in the closure
-	MOV	$ret+16(FP), X19
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
+	MOV	8(CTXT), X12    // compiler stores size at offset 8 in the closure
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	a+0(FP), X10
+	MOV	b+8(FP), X11
+	MOV	$ret+16(FP), X13
+#endif
+	// X10 = a_base
+	// X11 = b_base
 	JMP	memequal<>(SB)
 
-// On entry X5 and X6 contain pointers, X7 contains length.
-// X19 contains address for return value.
+// On entry X10 and X11 contain pointers, X12 contains length.
+// For non-regabi, X13 contains the address for the return value.
+// For regabi, the return value is stored in X10.
 TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0
-	BEQ	X5, X6, eq
+	BEQ	X10, X11, eq
 
-	MOV	$32, X8
-	BLT	X7, X8, loop4_check
+	MOV	$32, X23
+	BLT	X12, X23, loop4_check
 
 	// Check alignment - if alignment differs we have to do one byte at a time.
-	AND	$3, X5, X9
-	AND	$3, X6, X10
-	BNE	X9, X10, loop4_check
+	AND	$3, X10, X9
+	AND	$3, X11, X19
+	BNE	X9, X19, loop4_check
 	BEQZ	X9, loop32_check
 
 	// Check one byte at a time until we reach 8 byte alignment.
-	SUB	X9, X7, X7
+	SUB	X9, X12, X12
 align:
 	ADD	$-1, X9
-	MOVBU	0(X5), X10
-	MOVBU	0(X6), X11
-	BNE	X10, X11, not_eq
-	ADD	$1, X5
-	ADD	$1, X6
+	MOVBU	0(X10), X19
+	MOVBU	0(X11), X20
+	BNE	X19, X20, not_eq
+	ADD	$1, X10
+	ADD	$1, X11
 	BNEZ	X9, align
 
 loop32_check:
 	MOV	$32, X9
-	BLT	X7, X9, loop16_check
+	BLT	X12, X9, loop16_check
 loop32:
-	MOV	0(X5), X10
-	MOV	0(X6), X11
-	MOV	8(X5), X12
-	MOV	8(X6), X13
-	BNE	X10, X11, not_eq
-	BNE	X12, X13, not_eq
-	MOV	16(X5), X14
-	MOV	16(X6), X15
-	MOV	24(X5), X16
-	MOV	24(X6), X17
+	MOV	0(X10), X19
+	MOV	0(X11), X20
+	MOV	8(X10), X21
+	MOV	8(X11), X22
+	BNE	X19, X20, not_eq
+	BNE	X21, X22, not_eq
+	MOV	16(X10), X14
+	MOV	16(X11), X15
+	MOV	24(X10), X16
+	MOV	24(X11), X17
 	BNE	X14, X15, not_eq
 	BNE	X16, X17, not_eq
-	ADD	$32, X5
-	ADD	$32, X6
-	ADD	$-32, X7
-	BGE	X7, X9, loop32
-	BEQZ	X7, eq
+	ADD	$32, X10
+	ADD	$32, X11
+	ADD	$-32, X12
+	BGE	X12, X9, loop32
+	BEQZ	X12, eq
 
 loop16_check:
-	MOV	$16, X8
-	BLT	X7, X8, loop4_check
+	MOV	$16, X23
+	BLT	X12, X23, loop4_check
 loop16:
-	MOV	0(X5), X10
-	MOV	0(X6), X11
-	MOV	8(X5), X12
-	MOV	8(X6), X13
-	BNE	X10, X11, not_eq
-	BNE	X12, X13, not_eq
-	ADD	$16, X5
-	ADD	$16, X6
-	ADD	$-16, X7
-	BGE	X7, X8, loop16
-	BEQZ	X7, eq
+	MOV	0(X10), X19
+	MOV	0(X11), X20
+	MOV	8(X10), X21
+	MOV	8(X11), X22
+	BNE	X19, X20, not_eq
+	BNE	X21, X22, not_eq
+	ADD	$16, X10
+	ADD	$16, X11
+	ADD	$-16, X12
+	BGE	X12, X23, loop16
+	BEQZ	X12, eq
 
 loop4_check:
-	MOV	$4, X8
-	BLT	X7, X8, loop1
+	MOV	$4, X23
+	BLT	X12, X23, loop1
 loop4:
-	MOVBU	0(X5), X10
-	MOVBU	0(X6), X11
-	MOVBU	1(X5), X12
-	MOVBU	1(X6), X13
-	BNE	X10, X11, not_eq
-	BNE	X12, X13, not_eq
-	MOVBU	2(X5), X14
-	MOVBU	2(X6), X15
-	MOVBU	3(X5), X16
-	MOVBU	3(X6), X17
+	MOVBU	0(X10), X19
+	MOVBU	0(X11), X20
+	MOVBU	1(X10), X21
+	MOVBU	1(X11), X22
+	BNE	X19, X20, not_eq
+	BNE	X21, X22, not_eq
+	MOVBU	2(X10), X14
+	MOVBU	2(X11), X15
+	MOVBU	3(X10), X16
+	MOVBU	3(X11), X17
 	BNE	X14, X15, not_eq
 	BNE	X16, X17, not_eq
-	ADD	$4, X5
-	ADD	$4, X6
-	ADD	$-4, X7
-	BGE	X7, X8, loop4
+	ADD	$4, X10
+	ADD	$4, X11
+	ADD	$-4, X12
+	BGE	X12, X23, loop4
 
 loop1:
-	BEQZ	X7, eq
-	MOVBU	0(X5), X10
-	MOVBU	0(X6), X11
-	BNE	X10, X11, not_eq
-	ADD	$1, X5
-	ADD	$1, X6
-	ADD	$-1, X7
+	BEQZ	X12, eq
+	MOVBU	0(X10), X19
+	MOVBU	0(X11), X20
+	BNE	X19, X20, not_eq
+	ADD	$1, X10
+	ADD	$1, X11
+	ADD	$-1, X12
 	JMP	loop1
 
 not_eq:
-	MOV	$0, X5
-	MOVB	X5, (X19)
+#ifndef GOEXPERIMENT_regabiargs
+	MOVB	ZERO, (X13)
+#else
+	MOVB	ZERO, X10
+#endif
 	RET
 eq:
-	MOV	$1, X5
-	MOVB	X5, (X19)
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	$1, X10
+	MOVB	X10, (X13)
+#else
+	MOV	$1, X10
+#endif
 	RET
diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s
index 156c303..e9d3e6b 100644
--- a/src/internal/bytealg/indexbyte_riscv64.s
+++ b/src/internal/bytealg/indexbyte_riscv64.s
@@ -5,48 +5,67 @@
 #include "go_asm.h"
 #include "textflag.h"
 
-TEXT ·IndexByte(SB),NOSPLIT,$0-40
-	MOV	b_base+0(FP), A1
-	MOV	b_len+8(FP), A2
-	MOVBU	c+24(FP), A3	// byte to find
-	MOV	A1, A4		// store base for later
-	ADD	A1, A2		// end
-	ADD	$-1, A1
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	b_base+0(FP), X10
+	MOV	b_len+8(FP), X11
+	MOVBU	c+24(FP), X13	// byte to find
+#endif
+	// X10 = b_base
+	// X11 = b_len
+	// X12 = b_cap (unused)
+	// X13 = byte to find
+	MOV	X10, X12		// store base for later
+	ADD	X10, X11		// end
+	ADD	$-1, X10
 
 loop:
-	ADD	$1, A1
-	BEQ	A1, A2, notfound
-	MOVBU	(A1), A5
-	BNE	A3, A5, loop
+	ADD	$1, X10
+	BEQ	X10, X11, notfound
+	MOVBU	(X10), X14
+	BNE	X13, X14, loop
 
-	SUB	A4, A1		// remove base
-	MOV	A1, ret+32(FP)
+	SUB	X12, X10		// remove base
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	X10, ret+32(FP)
+#endif
 	RET
 
 notfound:
-	MOV	$-1, A1
-	MOV	A1, ret+32(FP)
+	MOV	$-1, X10
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	X10, ret+32(FP)
+#endif
 	RET
 
-TEXT ·IndexByteString(SB),NOSPLIT,$0-32
-	MOV	s_base+0(FP), A1
-	MOV	s_len+8(FP), A2
-	MOVBU	c+16(FP), A3	// byte to find
-	MOV	A1, A4		// store base for later
-	ADD	A1, A2		// end
-	ADD	$-1, A1
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	s_base+0(FP), X10
+	MOV	s_len+8(FP), X11
+	MOVBU	c+16(FP), X12	// byte to find
+#endif
+	// X10 = s_base
+	// X11 = s_len
+	// X12 = byte to find
+	MOV	X10, X13		// store base for later
+	ADD	X10, X11		// end
+	ADD	$-1, X10
 
 loop:
-	ADD	$1, A1
-	BEQ	A1, A2, notfound
-	MOVBU	(A1), A5
-	BNE	A3, A5, loop
+	ADD	$1, X10
+	BEQ	X10, X11, notfound
+	MOVBU	(X10), X14
+	BNE	X12, X14, loop
 
-	SUB	A4, A1		// remove base
-	MOV	A1, ret+24(FP)
+	SUB	X13, X10		// remove base
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	X10, ret+24(FP)
+#endif
 	RET
 
 notfound:
-	MOV	$-1, A1
-	MOV	A1, ret+24(FP)
+	MOV	$-1, X10
+#ifndef GOEXPERIMENT_regabiargs
+	MOV	X10, ret+24(FP)
+#endif
 	RET
diff --git a/src/net/dial_test.go b/src/net/dial_test.go
index b9aead0..3cce444 100644
--- a/src/net/dial_test.go
+++ b/src/net/dial_test.go
@@ -758,6 +758,12 @@
 func TestDialCancel(t *testing.T) {
 	mustHaveExternalNetwork(t)
 
+	if strings.HasPrefix(testenv.Builder(), "darwin-arm64") {
+		// The darwin-arm64 machines run in an environment that's not
+		// compatible with this test.
+		t.Skipf("builder %q gives no route to host for 198.18.0.0", testenv.Builder())
+	}
+
 	blackholeIPPort := JoinHostPort(slowDst4, "1234")
 	if !supportsIPv4() {
 		blackholeIPPort = JoinHostPort(slowDst6, "1234")
diff --git a/src/net/dnsclient.go b/src/net/dnsclient.go
index a779c37..b609dbd 100644
--- a/src/net/dnsclient.go
+++ b/src/net/dnsclient.go
@@ -13,13 +13,10 @@
 )
 
 // provided by runtime
-func fastrand() uint32
+func fastrandu() uint
 
 func randInt() int {
-	x, y := fastrand(), fastrand()    // 32-bit halves
-	u := uint(x)<<31 ^ uint(int32(y)) // full uint, even on 64-bit systems; avoid 32-bit shift on 32-bit systems
-	i := int(u >> 1)                  // clear sign bit, even on 32-bit systems
-	return i
+	return int(fastrandu() >> 1) // clear sign bit
 }
 
 func randIntn(n int) int {
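
The shortened randInt works because shifting the unsigned value right by one clears what becomes the sign bit after conversion, on 32- and 64-bit platforms alike; a standalone sketch of the argument:

package main

import "fmt"

func main() {
	u := ^uint(0)           // worst case: every bit set
	fmt.Println(int(u) < 0) // true: a direct conversion can be negative
	fmt.Println(int(u >> 1)) // 9223372036854775807 on 64-bit: shifting first clears the sign bit
}
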
diff --git a/src/net/http/export_test.go b/src/net/http/export_test.go
index a849327..205ca83 100644
--- a/src/net/http/export_test.go
+++ b/src/net/http/export_test.go
@@ -306,3 +306,12 @@
 	}
 	tr.idleMu.Unlock()
 }
+
+// ResponseWriterConnForTesting returns w's underlying connection, if w
+// is a regular *response ResponseWriter.
+func ResponseWriterConnForTesting(w ResponseWriter) (c net.Conn, ok bool) {
+	if r, ok := w.(*response); ok {
+		return r.conn.rwc, true
+	}
+	return nil, false
+}
diff --git a/src/net/http/request.go b/src/net/http/request.go
index 3122119..d091f3c 100644
--- a/src/net/http/request.go
+++ b/src/net/http/request.go
@@ -1126,21 +1126,34 @@
 // MaxBytesReader is similar to io.LimitReader but is intended for
 // limiting the size of incoming request bodies. In contrast to
 // io.LimitReader, MaxBytesReader's result is a ReadCloser, returns a
-// non-EOF error for a Read beyond the limit, and closes the
+// MaxBytesError for a Read beyond the limit, and closes the
 // underlying reader when its Close method is called.
 //
 // MaxBytesReader prevents clients from accidentally or maliciously
-// sending a large request and wasting server resources.
+// sending a large request and wasting server resources. If possible,
+// it tells the ResponseWriter to close the connection after the limit
+// has been reached.
 func MaxBytesReader(w ResponseWriter, r io.ReadCloser, n int64) io.ReadCloser {
 	if n < 0 { // Treat negative limits as equivalent to 0.
 		n = 0
 	}
-	return &maxBytesReader{w: w, r: r, n: n}
+	return &maxBytesReader{w: w, r: r, i: n, n: n}
+}
+
+// MaxBytesError is returned by MaxBytesReader when its read limit is exceeded.
+type MaxBytesError struct {
+	Limit int64
+}
+
+func (e *MaxBytesError) Error() string {
+	// Due to Hyrum's law, this text cannot be changed.
+	return "http: request body too large"
 }
 
 type maxBytesReader struct {
 	w   ResponseWriter
 	r   io.ReadCloser // underlying reader
+	i   int64         // max bytes initially, for MaxBytesError
 	n   int64         // max bytes remaining
 	err error         // sticky error
 }
@@ -1182,7 +1195,7 @@
 	if res, ok := l.w.(requestTooLarger); ok {
 		res.requestTooLarge()
 	}
-	l.err = errors.New("http: request body too large")
+	l.err = &MaxBytesError{l.i}
 	return n, l.err
 }
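
With MaxBytesError exported, handlers can detect the over-limit case without matching the error string; a hypothetical handler sketch (the 1 MiB limit and handler name are illustrative, not from this change):

package main

import (
	"errors"
	"fmt"
	"io"
	"net/http"
)

func handler(w http.ResponseWriter, r *http.Request) {
	r.Body = http.MaxBytesReader(w, r.Body, 1<<20) // cap the body at 1 MiB
	body, err := io.ReadAll(r.Body)
	var mbErr *http.MaxBytesError
	if errors.As(err, &mbErr) {
		http.Error(w, fmt.Sprintf("body exceeds %d bytes", mbErr.Limit), http.StatusRequestEntityTooLarge)
		return
	}
	_ = body // process the (size-checked) body
}

func main() {
	http.HandleFunc("/", handler)
	// http.ListenAndServe(":8080", nil) // left commented: sketch only
}
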
 
diff --git a/src/net/http/serve_test.go b/src/net/http/serve_test.go
index 1c85a66..404cca0 100644
--- a/src/net/http/serve_test.go
+++ b/src/net/http/serve_test.go
@@ -3035,6 +3035,13 @@
 		if n != limit {
 			t.Errorf("io.Copy = %d, want %d", n, limit)
 		}
+		mbErr, ok := err.(*MaxBytesError)
+		if !ok {
+			t.Errorf("expected MaxBytesError, got %T", err)
+		}
+		if mbErr.Limit != limit {
+			t.Errorf("MaxBytesError.Limit = %d, want %d", mbErr.Limit, limit)
+		}
 	}))
 	defer cst.close()
 
diff --git a/src/net/http/sniff.go b/src/net/http/sniff.go
index 67a7151..ac18ab9 100644
--- a/src/net/http/sniff.go
+++ b/src/net/http/sniff.go
@@ -129,11 +129,6 @@
 	// Enforce the pattern match ordering as prescribed in
 	// https://mimesniff.spec.whatwg.org/#matching-an-audio-or-video-type-pattern
 	&maskedSig{
-		mask: []byte("\xFF\xFF\xFF\xFF"),
-		pat:  []byte(".snd"),
-		ct:   "audio/basic",
-	},
-	&maskedSig{
 		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
 		pat:  []byte("FORM\x00\x00\x00\x00AIFF"),
 		ct:   "audio/aiff",
diff --git a/src/net/http/transport_test.go b/src/net/http/transport_test.go
index 6fcb458..acdfae3 100644
--- a/src/net/http/transport_test.go
+++ b/src/net/http/transport_test.go
@@ -57,6 +57,12 @@
 	}
 	w.Header().Set("X-Saw-Close", fmt.Sprint(r.Close))
 	w.Write([]byte(r.RemoteAddr))
+
+	// Include the address of the net.Conn in addition to the RemoteAddr,
+	// in case kernels reuse source ports quickly (see Issue 52450).
+	if c, ok := ResponseWriterConnForTesting(w); ok {
+		fmt.Fprintf(w, ", %T %p", c, c)
+	}
 })
 
 // testCloseConn is a net.Conn tracked by a testConnSet.
@@ -240,6 +246,12 @@
 	connSet.check(t)
 }
 
+// TestTransportConnectionCloseOnRequest tests that the Transport doesn't reuse
+// an underlying TCP connection after making an http.Request with Request.Close set.
+//
+// It tests the behavior by making an HTTP request to a server which
+// describes the source connection it got (remote port number +
+// address of its net.Conn).
 func TestTransportConnectionCloseOnRequest(t *testing.T) {
 	defer afterTest(t)
 	ts := httptest.NewServer(hostPortHandler)
@@ -250,7 +262,7 @@
 	c := ts.Client()
 	tr := c.Transport.(*Transport)
 	tr.Dial = testDial
-	for _, connectionClose := range []bool{false, true} {
+	for _, reqClose := range []bool{false, true} {
 		fetch := func(n int) string {
 			req := new(Request)
 			var err error
@@ -262,29 +274,37 @@
 			req.Proto = "HTTP/1.1"
 			req.ProtoMajor = 1
 			req.ProtoMinor = 1
-			req.Close = connectionClose
+			req.Close = reqClose
 
 			res, err := c.Do(req)
 			if err != nil {
-				t.Fatalf("error in connectionClose=%v, req #%d, Do: %v", connectionClose, n, err)
+				t.Fatalf("error in Request.Close=%v, req #%d, Do: %v", reqClose, n, err)
 			}
-			if got, want := res.Header.Get("X-Saw-Close"), fmt.Sprint(connectionClose); got != want {
-				t.Errorf("For connectionClose = %v; handler's X-Saw-Close was %v; want %v",
-					connectionClose, got, !connectionClose)
+			if got, want := res.Header.Get("X-Saw-Close"), fmt.Sprint(reqClose); got != want {
+				t.Errorf("for Request.Close = %v; handler's X-Saw-Close was %v; want %v",
+					reqClose, got, !reqClose)
 			}
 			body, err := io.ReadAll(res.Body)
 			if err != nil {
-				t.Fatalf("error in connectionClose=%v, req #%d, ReadAll: %v", connectionClose, n, err)
+				t.Fatalf("for Request.Close=%v, on request %v/2: ReadAll: %v", reqClose, n, err)
 			}
 			return string(body)
 		}
 
 		body1 := fetch(1)
 		body2 := fetch(2)
-		bodiesDiffer := body1 != body2
-		if bodiesDiffer != connectionClose {
-			t.Errorf("error in connectionClose=%v. unexpected bodiesDiffer=%v; body1=%q; body2=%q",
-				connectionClose, bodiesDiffer, body1, body2)
+
+		got := 1
+		if body1 != body2 {
+			got++
+		}
+		want := 1
+		if reqClose {
+			want = 2
+		}
+		if got != want {
+			t.Errorf("for Request.Close=%v: server saw %v unique connections, wanted %v\n\nbodies were: %q and %q",
+				reqClose, got, want, body1, body2)
 		}
 
 		tr.CloseIdleConnections()
diff --git a/src/net/net_test.go b/src/net/net_test.go
index fa5ad63..29a2c5d 100644
--- a/src/net/net_test.go
+++ b/src/net/net_test.go
@@ -529,17 +529,19 @@
 		<-dialed
 		cs.(*TCPConn).SetLinger(0)
 		cs.Close()
-
-		ln.Close()
 	}()
-	defer func() { <-serverDone }()
+	defer func() {
+		ln.Close()
+		<-serverDone
+	}()
 
 	ss, err := Dial("tcp", ln.Addr().String())
+	close(dialed)
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer ss.Close()
-	close(dialed)
+
 	_, err = ss.Read([]byte{0})
 	if err == nil {
 		t.Fatal("Read succeeded unexpectedly")
diff --git a/src/net/netip/netip.go b/src/net/netip/netip.go
index 8fad25d..7d8b203 100644
--- a/src/net/netip/netip.go
+++ b/src/net/netip/netip.go
@@ -456,7 +456,7 @@
 
 // Is4 reports whether ip is an IPv4 address.
 //
-// It returns false for IPv4-mapped IPv6 addresses. See IP.Unmap.
+// It returns false for IPv4-mapped IPv6 addresses. See Addr.Unmap.
 func (ip Addr) Is4() bool {
 	return ip.z == z4
 }
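
The corrected cross-reference is Addr.Unmap (the receiver type is Addr, not IP); for illustration, a runnable sketch of the documented behavior:

package main

import (
	"fmt"
	"net/netip"
)

func main() {
	mapped := netip.MustParseAddr("::ffff:192.0.2.1") // IPv4-mapped IPv6
	fmt.Println(mapped.Is4())         // false, per the doc comment
	fmt.Println(mapped.Unmap().Is4()) // true once Addr.Unmap strips the mapping
}
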
diff --git a/src/os/exec/env_test.go b/src/os/exec/env_test.go
index b5ac398..112f1e6 100644
--- a/src/os/exec/env_test.go
+++ b/src/os/exec/env_test.go
@@ -18,17 +18,29 @@
 		{
 			noCase: true,
 			in:     []string{"k1=v1", "k2=v2", "K1=v3"},
-			want:   []string{"K1=v3", "k2=v2"},
+			want:   []string{"k2=v2", "K1=v3"},
 		},
 		{
 			noCase: false,
 			in:     []string{"k1=v1", "K1=V2", "k1=v3"},
-			want:   []string{"k1=v3", "K1=V2"},
+			want:   []string{"K1=V2", "k1=v3"},
 		},
 		{
 			in:   []string{"=a", "=b", "foo", "bar"},
 			want: []string{"=b", "foo", "bar"},
 		},
+		{
+			// #49886: preserve weird Windows keys with leading "=" signs.
+			noCase: true,
+			in:     []string{`=C:=C:\golang`, `=D:=D:\tmp`, `=D:=D:\`},
+			want:   []string{`=C:=C:\golang`, `=D:=D:\`},
+		},
+		{
+			// #52436: preserve invalid key-value entries (for now).
+			// (Maybe filter them out or error out on them at some point.)
+			in:   []string{"dodgy", "entries"},
+			want: []string{"dodgy", "entries"},
+		},
 	}
 	for _, tt := range tests {
 		got := dedupEnvCase(tt.noCase, tt.in)
diff --git a/src/os/exec/example_test.go b/src/os/exec/example_test.go
index a66890b..bb166ce 100644
--- a/src/os/exec/example_test.go
+++ b/src/os/exec/example_test.go
@@ -144,6 +144,21 @@
 	fmt.Printf("%s\n", stdoutStderr)
 }
 
+func ExampleCmd_Environ() {
+	cmd := exec.Command("pwd")
+
+	// Set Dir before calling cmd.Environ so that it will include an
+	// updated PWD variable (on platforms where that is used).
+	cmd.Dir = ".."
+	cmd.Env = append(cmd.Environ(), "POSIXLY_CORRECT=1")
+
+	out, err := cmd.Output()
+	if err != nil {
+		log.Fatal(err)
+	}
+	fmt.Printf("%s\n", out)
+}
+
 func ExampleCommandContext() {
 	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer cancel()
diff --git a/src/os/exec/exec.go b/src/os/exec/exec.go
index 845b737..91c2e00 100644
--- a/src/os/exec/exec.go
+++ b/src/os/exec/exec.go
@@ -223,13 +223,6 @@
 	return a == b
 }
 
-func (c *Cmd) envv() ([]string, error) {
-	if c.Env != nil {
-		return c.Env, nil
-	}
-	return execenv.Default(c.SysProcAttr)
-}
-
 func (c *Cmd) argv() []string {
 	if len(c.Args) > 0 {
 		return c.Args
@@ -237,10 +230,6 @@
 	return []string{c.Path}
 }
 
-// skipStdinCopyError optionally specifies a function which reports
-// whether the provided stdin copy error should be ignored.
-var skipStdinCopyError func(error) bool
-
 func (c *Cmd) stdin() (f *os.File, err error) {
 	if c.Stdin == nil {
 		f, err = os.Open(os.DevNull)
@@ -264,7 +253,7 @@
 	c.closeAfterWait = append(c.closeAfterWait, pw)
 	c.goroutine = append(c.goroutine, func() error {
 		_, err := io.Copy(pw, c.Stdin)
-		if skip := skipStdinCopyError; skip != nil && skip(err) {
+		if skipStdinCopyError(err) {
 			err = nil
 		}
 		if err1 := pw.Close(); err == nil {
@@ -414,7 +403,7 @@
 	}
 	c.childFiles = append(c.childFiles, c.ExtraFiles...)
 
-	envv, err := c.envv()
+	env, err := c.environ()
 	if err != nil {
 		return err
 	}
@@ -422,7 +411,7 @@
 	c.Process, err = os.StartProcess(c.Path, c.argv(), &os.ProcAttr{
 		Dir:   c.Dir,
 		Files: c.childFiles,
-		Env:   addCriticalEnv(dedupEnv(envv)),
+		Env:   env,
 		Sys:   c.SysProcAttr,
 	})
 	if err != nil {
@@ -735,6 +724,54 @@
 	return b
 }
 
+// environ returns a best-effort copy of the environment in which the command
+// would be run as it is currently configured. If an error occurs in computing
+// the environment, it is returned alongside the best-effort copy.
+func (c *Cmd) environ() ([]string, error) {
+	var err error
+
+	env := c.Env
+	if env == nil {
+		env, err = execenv.Default(c.SysProcAttr)
+		if err != nil {
+			env = os.Environ()
+			// Note that the non-nil err is preserved despite env being overridden.
+		}
+
+		if c.Dir != "" {
+			switch runtime.GOOS {
+			case "windows", "plan9":
+				// Windows and Plan 9 do not use the PWD variable, so we don't need to
+				// keep it accurate.
+			default:
+				// On POSIX platforms, PWD represents “an absolute pathname of the
+				// current working directory.” Since we are changing the working
+				// directory for the command, we should also update PWD to reflect that.
+				//
+				// Unfortunately, we didn't always do that, so (as proposed in
+				// https://go.dev/issue/50599) to avoid unintended collateral damage we
+				// only implicitly update PWD when Env is nil. That way, we're much
+				// less likely to override an intentional change to the variable.
+				if pwd, absErr := filepath.Abs(c.Dir); absErr == nil {
+					env = append(env, "PWD="+pwd)
+				} else if err == nil {
+					err = absErr
+				}
+			}
+		}
+	}
+
+	return addCriticalEnv(dedupEnv(env)), err
+}
+
+// Environ returns a copy of the environment in which the command would be run
+// as it is currently configured.
+func (c *Cmd) Environ() []string {
+	// Intentionally ignore errors: environ returns a best-effort environment no matter what.
+	env, _ := c.environ()
+	return env
+}
+
 // dedupEnv returns a copy of env with any duplicates removed, in favor of
 // later values.
 // Items not of the normal environment "key=value" form are preserved unchanged.
@@ -745,24 +782,47 @@
 // dedupEnvCase is dedupEnv with a case option for testing.
 // If caseInsensitive is true, the case of keys is ignored.
 func dedupEnvCase(caseInsensitive bool, env []string) []string {
+	// Construct the output in reverse order, to preserve the
+	// last occurrence of each key.
 	out := make([]string, 0, len(env))
-	saw := make(map[string]int, len(env)) // key => index into out
-	for _, kv := range env {
-		k, _, ok := strings.Cut(kv, "=")
-		if !ok {
-			out = append(out, kv)
+	saw := make(map[string]bool, len(env))
+	for n := len(env); n > 0; n-- {
+		kv := env[n-1]
+
+		i := strings.Index(kv, "=")
+		if i == 0 {
+			// We observe in practice keys with a single leading "=" on Windows.
+			// TODO(#49886): Should we consume only the first leading "=" as part
+			// of the key, or parse through arbitrarily many of them until a non-"="?
+			i = strings.Index(kv[1:], "=") + 1
+		}
+		if i < 0 {
+			if kv != "" {
+				// The entry is not of the form "key=value" (as it is required to be).
+				// Leave it as-is for now.
+				// TODO(#52436): should we strip or reject these bogus entries?
+				out = append(out, kv)
+			}
 			continue
 		}
+		k := kv[:i]
 		if caseInsensitive {
 			k = strings.ToLower(k)
 		}
-		if dupIdx, isDup := saw[k]; isDup {
-			out[dupIdx] = kv
+		if saw[k] {
 			continue
 		}
-		saw[k] = len(out)
+
+		saw[k] = true
 		out = append(out, kv)
 	}
+
+	// Now reverse the slice to restore the original order.
+	for i := 0; i < len(out)/2; i++ {
+		j := len(out) - i - 1
+		out[i], out[j] = out[j], out[i]
+	}
+
 	return out
 }
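
The reverse pass keeps the first occurrence it sees of each key, which is
the last occurrence in the original order; reversing the result then
restores the surviving entries' relative order. A simplified sketch of the
same idea (it ignores the Windows leading-"=" quirk and the malformed-entry
handling above; uses strings.Cut, Go 1.18+):

	func dedupLast(env []string) []string {
		out := make([]string, 0, len(env))
		seen := make(map[string]bool, len(env))
		for n := len(env); n > 0; n-- { // walk backwards: last value wins
			kv := env[n-1]
			k, _, ok := strings.Cut(kv, "=")
			if !ok || seen[k] {
				continue
			}
			seen[k] = true
			out = append(out, kv)
		}
		for i, j := 0, len(out)-1; i < j; i, j = i+1, j-1 {
			out[i], out[j] = out[j], out[i] // restore original order
		}
		return out
	}

	// dedupLast([]string{"FOO=bad", "PATH=/bin", "FOO=good"})
	// => []string{"PATH=/bin", "FOO=good"}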
 
diff --git a/src/os/exec/exec_linux_test.go b/src/os/exec/exec_linux_test.go
index 4a37c96..b9f6b7b 100644
--- a/src/os/exec/exec_linux_test.go
+++ b/src/os/exec/exec_linux_test.go
@@ -22,7 +22,7 @@
 )
 
 func init() {
-	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
+	if os.Getenv("GO_EXEC_TEST_PID") == "" {
 		return
 	}
 
diff --git a/src/os/exec/exec_plan9.go b/src/os/exec/exec_plan9.go
index 21ac7b7..8920bec 100644
--- a/src/os/exec/exec_plan9.go
+++ b/src/os/exec/exec_plan9.go
@@ -6,14 +6,14 @@
 
 import "io/fs"
 
-func init() {
-	skipStdinCopyError = func(err error) bool {
-		// Ignore hungup errors copying to stdin if the program
-		// completed successfully otherwise.
-		// See Issue 35753.
-		pe, ok := err.(*fs.PathError)
-		return ok &&
-			pe.Op == "write" && pe.Path == "|1" &&
-			pe.Err.Error() == "i/o on hungup channel"
-	}
+// skipStdinCopyError reports whether the provided stdin copy error
+// should be ignored.
+func skipStdinCopyError(err error) bool {
+	// Ignore hungup errors copying to stdin if the program
+	// completed successfully otherwise.
+	// See Issue 35753.
+	pe, ok := err.(*fs.PathError)
+	return ok &&
+		pe.Op == "write" && pe.Path == "|1" &&
+		pe.Err.Error() == "i/o on hungup channel"
 }
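
Turning the init-assigned function variable into an ordinary function
defined once per GOOS file gives each platform exactly one statically
resolved implementation and removes the package-level mutable state. A
generic sketch of the idiom (hypothetical package and names):

	// retry_notwindows.go
	//go:build !windows

	package mypkg

	// isRetryable reports whether err is worth retrying (POSIX rules).
	func isRetryable(err error) bool { return false }

	// retry_windows.go
	//go:build windows

	package mypkg

	// isRetryable reports whether err is worth retrying (Windows rules).
	func isRetryable(err error) bool { return false }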
diff --git a/src/os/exec/exec_posix_test.go b/src/os/exec/exec_posix_test.go
index ce83a9e..f040137 100644
--- a/src/os/exec/exec_posix_test.go
+++ b/src/os/exec/exec_posix_test.go
@@ -7,16 +7,46 @@
 package exec_test
 
 import (
+	"fmt"
+	"internal/testenv"
+	"os"
 	"os/user"
+	"path/filepath"
+	"reflect"
 	"runtime"
 	"strconv"
+	"strings"
 	"syscall"
 	"testing"
 	"time"
 )
 
+func init() {
+	registerHelperCommand("pwd", cmdPwd)
+	registerHelperCommand("sleep", cmdSleep)
+}
+
+func cmdPwd(...string) {
+	pwd, err := os.Getwd()
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+	fmt.Println(pwd)
+}
+
+func cmdSleep(args ...string) {
+	n, err := strconv.Atoi(args[0])
+	if err != nil {
+		fmt.Println(err)
+		os.Exit(1)
+	}
+	time.Sleep(time.Duration(n) * time.Second)
+}
+
 func TestCredentialNoSetGroups(t *testing.T) {
 	if runtime.GOOS == "android" {
+		maySkipHelperCommand("echo")
 		t.Skip("unsupported on Android")
 	}
 
@@ -55,7 +85,7 @@
 func TestWaitid(t *testing.T) {
 	t.Parallel()
 
-	cmd := helperCommand(t, "sleep")
+	cmd := helperCommand(t, "sleep", "3")
 	if err := cmd.Start(); err != nil {
 		t.Fatal(err)
 	}
@@ -86,3 +116,139 @@
 
 	<-ch
 }
+
+// https://go.dev/issue/50599: if Env is not set explicitly, setting Dir should
+// implicitly update PWD to the correct path, and Environ should list the
+// updated value.
+func TestImplicitPWD(t *testing.T) {
+	t.Parallel()
+
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	cases := []struct {
+		name string
+		dir  string
+		want string
+	}{
+		{"empty", "", cwd},
+		{"dot", ".", cwd},
+		{"dotdot", "..", filepath.Dir(cwd)},
+		{"PWD", cwd, cwd},
+		{"PWDdotdot", cwd + string(filepath.Separator) + "..", filepath.Dir(cwd)},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			cmd := helperCommand(t, "pwd")
+			if cmd.Env != nil {
+				t.Fatalf("test requires helperCommand not to set Env field")
+			}
+			cmd.Dir = tc.dir
+
+			var pwds []string
+			for _, kv := range cmd.Environ() {
+				if strings.HasPrefix(kv, "PWD=") {
+					pwds = append(pwds, strings.TrimPrefix(kv, "PWD="))
+				}
+			}
+
+			wantPWDs := []string{tc.want}
+			if tc.dir == "" {
+				if _, ok := os.LookupEnv("PWD"); !ok {
+					wantPWDs = nil
+				}
+			}
+			if !reflect.DeepEqual(pwds, wantPWDs) {
+				t.Errorf("PWD entries in cmd.Environ():\n\t%s\nwant:\n\t%s", strings.Join(pwds, "\n\t"), strings.Join(wantPWDs, "\n\t"))
+			}
+
+			cmd.Stderr = new(strings.Builder)
+			out, err := cmd.Output()
+			if err != nil {
+				t.Fatalf("%v:\n%s", err, cmd.Stderr)
+			}
+			got := strings.Trim(string(out), "\r\n")
+			t.Logf("in\n\t%s\n`pwd` reported\n\t%s", tc.dir, got)
+			if got != tc.want {
+				t.Errorf("want\n\t%s", tc.want)
+			}
+		})
+	}
+}
+
+// However, if cmd.Env is set explicitly, setting Dir should not override it.
+// (This checks that the implementation for https://go.dev/issue/50599 doesn't
+// break existing users who may have explicitly mismatched the PWD variable.)
+func TestExplicitPWD(t *testing.T) {
+	maySkipHelperCommand("pwd")
+	testenv.MustHaveSymlink(t)
+
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	link := filepath.Join(t.TempDir(), "link")
+	if err := os.Symlink(cwd, link); err != nil {
+		t.Fatal(err)
+	}
+
+	// Now link is another equally-valid name for cwd. If we set Dir to one and
+	// PWD to the other, the subprocess should report the PWD version.
+	cases := []struct {
+		name string
+		dir  string
+		pwd  string
+	}{
+		{name: "original PWD", pwd: cwd},
+		{name: "link PWD", pwd: link},
+		{name: "in link with original PWD", dir: link, pwd: cwd},
+		{name: "in dir with link PWD", dir: cwd, pwd: link},
+		// Ideally we would also like to test what happens if we set PWD to
+		// something totally bogus (or the empty string), but then we would have no
+		// idea what output the subprocess should actually produce: cwd itself may
+		// contain symlinks preserved from the PWD value in the test's environment.
+	}
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			cmd := helperCommand(t, "pwd")
+			// This is intentionally opposite to the usual order of setting cmd.Dir
+			// and then calling cmd.Environ. Here, we *want* PWD not to match cmd.Dir,
+			// so we don't care whether cmd.Dir is reflected in cmd.Environ.
+			cmd.Env = append(cmd.Environ(), "PWD="+tc.pwd)
+			cmd.Dir = tc.dir
+
+			var pwds []string
+			for _, kv := range cmd.Environ() {
+				if strings.HasPrefix(kv, "PWD=") {
+					pwds = append(pwds, strings.TrimPrefix(kv, "PWD="))
+				}
+			}
+
+			wantPWDs := []string{tc.pwd}
+			if !reflect.DeepEqual(pwds, wantPWDs) {
+				t.Errorf("PWD entries in cmd.Environ():\n\t%s\nwant:\n\t%s", strings.Join(pwds, "\n\t"), strings.Join(wantPWDs, "\n\t"))
+			}
+
+			cmd.Stderr = new(strings.Builder)
+			out, err := cmd.Output()
+			if err != nil {
+				t.Fatalf("%v:\n%s", err, cmd.Stderr)
+			}
+			got := strings.Trim(string(out), "\r\n")
+			t.Logf("in\n\t%s\nwith PWD=%s\nsubprocess os.Getwd() reported\n\t%s", tc.dir, tc.pwd, got)
+			if got != tc.pwd {
+				t.Errorf("want\n\t%s", tc.pwd)
+			}
+		})
+	}
+}
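
The symlink setup works because, on POSIX systems, os.Getwd consults $PWD
first and returns it whenever it names the same directory as the real
working directory. A standalone sketch (assumes /tmp/link is a symlink to
the current directory):

	os.Setenv("PWD", "/tmp/link") // another valid name for the cwd
	wd, _ := os.Getwd()
	fmt.Println(wd) // "/tmp/link", not the resolved path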
diff --git a/src/os/exec/exec_test.go b/src/os/exec/exec_test.go
index 73aa35f..c593cbd 100644
--- a/src/os/exec/exec_test.go
+++ b/src/os/exec/exec_test.go
@@ -11,6 +11,7 @@
 	"bufio"
 	"bytes"
 	"context"
+	"flag"
 	"fmt"
 	"internal/poll"
 	"internal/testenv"
@@ -27,6 +28,7 @@
 	"runtime"
 	"strconv"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 )
@@ -36,7 +38,7 @@
 var haveUnexpectedFDs bool
 
 func init() {
-	if os.Getenv("GO_WANT_HELPER_PROCESS") == "1" {
+	if os.Getenv("GO_EXEC_TEST_PID") != "" {
 		return
 	}
 	if runtime.GOOS == "windows" {
@@ -54,22 +56,253 @@
 	}
 }
 
-func helperCommandContext(t *testing.T, ctx context.Context, s ...string) (cmd *exec.Cmd) {
+// TestMain allows the test binary to impersonate many other binaries,
+// some of which may manipulate os.Stdin, os.Stdout, and/or os.Stderr
+// (and thus cannot run as an ordinary Test function, since the testing
+// package monkey-patches those variables before running tests).
+func TestMain(m *testing.M) {
+	flag.Parse()
+
+	pid := os.Getpid()
+	if os.Getenv("GO_EXEC_TEST_PID") == "" {
+		os.Setenv("GO_EXEC_TEST_PID", strconv.Itoa(pid))
+
+		code := m.Run()
+		if code == 0 && flag.Lookup("test.run").Value.String() == "" && flag.Lookup("test.list").Value.String() == "" {
+			for cmd := range helperCommands {
+				if _, ok := helperCommandUsed.Load(cmd); !ok {
+					fmt.Fprintf(os.Stderr, "helper command unused: %q\n", cmd)
+					code = 1
+				}
+			}
+		}
+		os.Exit(code)
+	}
+
+	args := flag.Args()
+	if len(args) == 0 {
+		fmt.Fprintf(os.Stderr, "No command\n")
+		os.Exit(2)
+	}
+
+	cmd, args := args[0], args[1:]
+	f, ok := helperCommands[cmd]
+	if !ok {
+		fmt.Fprintf(os.Stderr, "Unknown command %q\n", cmd)
+		os.Exit(2)
+	}
+	f(args...)
+	os.Exit(0)
+}
+
+// registerHelperCommand registers a command that the test process can impersonate.
+// A command should be registered in the same source file in which it is used.
+// If all tests are run and pass, all registered commands must be used.
+// (This prevents stale commands from accreting if tests are removed or
+// refactored over time.)
+func registerHelperCommand(name string, f func(...string)) {
+	if helperCommands[name] != nil {
+		panic("duplicate command registered: " + name)
+	}
+	helperCommands[name] = f
+}
+
+// maySkipHelperCommand records that the named helper command is exercised
+// by a test that may call Skip before it actually calls helperCommand.
+func maySkipHelperCommand(name string) {
+	helperCommandUsed.Store(name, true)
+}
+
+// helperCommand returns an exec.Cmd that will run the named helper command.
+func helperCommand(t *testing.T, name string, args ...string) *exec.Cmd {
+	t.Helper()
+	return helperCommandContext(t, nil, name, args...)
+}
+
+// helperCommandContext is like helperCommand, but also accepts a Context under
+// which to run the command.
+func helperCommandContext(t *testing.T, ctx context.Context, name string, args ...string) (cmd *exec.Cmd) {
+	helperCommandUsed.LoadOrStore(name, true)
+
+	t.Helper()
 	testenv.MustHaveExec(t)
 
-	cs := []string{"-test.run=TestHelperProcess", "--"}
-	cs = append(cs, s...)
+	cs := append([]string{name}, args...)
 	if ctx != nil {
-		cmd = exec.CommandContext(ctx, os.Args[0], cs...)
+		cmd = exec.CommandContext(ctx, exePath(t), cs...)
 	} else {
-		cmd = exec.Command(os.Args[0], cs...)
+		cmd = exec.Command(exePath(t), cs...)
 	}
-	cmd.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1")
 	return cmd
 }
 
-func helperCommand(t *testing.T, s ...string) *exec.Cmd {
-	return helperCommandContext(t, nil, s...)
+// exePath returns the path to the running executable.
+func exePath(t testing.TB) string {
+	exeOnce.Do(func() {
+		// Use os.Executable instead of os.Args[0] in case the caller modifies
+		// cmd.Dir: if the test binary is invoked like "./exec.test", it should
+		// not fail spuriously.
+		exeOnce.path, exeOnce.err = os.Executable()
+	})
+
+	if exeOnce.err != nil {
+		if t == nil {
+			panic(exeOnce.err)
+		}
+		t.Fatal(exeOnce.err)
+	}
+
+	return exeOnce.path
+}
+
+var exeOnce struct {
+	path string
+	err  error
+	sync.Once
+}
+
+var helperCommandUsed sync.Map
+
+var helperCommands = map[string]func(...string){
+	"echo":               cmdEcho,
+	"echoenv":            cmdEchoEnv,
+	"cat":                cmdCat,
+	"pipetest":           cmdPipeTest,
+	"stdinClose":         cmdStdinClose,
+	"exit":               cmdExit,
+	"describefiles":      cmdDescribeFiles,
+	"extraFilesAndPipes": cmdExtraFilesAndPipes,
+	"stderrfail":         cmdStderrFail,
+	"yes":                cmdYes,
+}
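
Adding a new fake binary under this scheme takes two steps: register it in
the source file that uses it, then run it via helperCommand. A hypothetical
example:

	func init() {
		registerHelperCommand("hostname", cmdHostname)
	}

	func cmdHostname(...string) {
		name, err := os.Hostname()
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		fmt.Println(name)
	}

	func TestHostname(t *testing.T) {
		out, err := helperCommand(t, "hostname").Output()
		if err != nil {
			t.Fatal(err)
		}
		t.Logf("child reported hostname %s", out)
	}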
+
+func cmdEcho(args ...string) {
+	iargs := []any{}
+	for _, s := range args {
+		iargs = append(iargs, s)
+	}
+	fmt.Println(iargs...)
+}
+
+func cmdEchoEnv(args ...string) {
+	for _, s := range args {
+		fmt.Println(os.Getenv(s))
+	}
+}
+
+func cmdCat(args ...string) {
+	if len(args) == 0 {
+		io.Copy(os.Stdout, os.Stdin)
+		return
+	}
+	exit := 0
+	for _, fn := range args {
+		f, err := os.Open(fn)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "Error: %v\n", err)
+			exit = 2
+		} else {
+			defer f.Close()
+			io.Copy(os.Stdout, f)
+		}
+	}
+	os.Exit(exit)
+}
+
+func cmdPipeTest(...string) {
+	bufr := bufio.NewReader(os.Stdin)
+	for {
+		line, _, err := bufr.ReadLine()
+		if err == io.EOF {
+			break
+		} else if err != nil {
+			os.Exit(1)
+		}
+		if bytes.HasPrefix(line, []byte("O:")) {
+			os.Stdout.Write(line)
+			os.Stdout.Write([]byte{'\n'})
+		} else if bytes.HasPrefix(line, []byte("E:")) {
+			os.Stderr.Write(line)
+			os.Stderr.Write([]byte{'\n'})
+		} else {
+			os.Exit(1)
+		}
+	}
+}
+
+func cmdStdinClose(...string) {
+	b, err := io.ReadAll(os.Stdin)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
+		os.Exit(1)
+	}
+	if s := string(b); s != stdinCloseTestString {
+		fmt.Fprintf(os.Stderr, "Error: Read %q, want %q", s, stdinCloseTestString)
+		os.Exit(1)
+	}
+}
+
+func cmdExit(args ...string) {
+	n, _ := strconv.Atoi(args[0])
+	os.Exit(n)
+}
+
+func cmdDescribeFiles(args ...string) {
+	f := os.NewFile(3, "fd3")
+	ln, err := net.FileListener(f)
+	if err == nil {
+		fmt.Printf("fd3: listener %s\n", ln.Addr())
+		ln.Close()
+	}
+}
+
+func cmdExtraFilesAndPipes(args ...string) {
+	n, _ := strconv.Atoi(args[0])
+	pipes := make([]*os.File, n)
+	for i := 0; i < n; i++ {
+		pipes[i] = os.NewFile(uintptr(3+i), strconv.Itoa(i))
+	}
+	response := ""
+	for i, r := range pipes {
+		ch := make(chan string, 1)
+		go func(c chan string) {
+			buf := make([]byte, 10)
+			n, err := r.Read(buf)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "Child: read error: %v on pipe %d\n", err, i)
+				os.Exit(1)
+			}
+			c <- string(buf[:n])
+			close(c)
+		}(ch)
+		select {
+		case m := <-ch:
+			response = response + m
+		case <-time.After(5 * time.Second):
+			fmt.Fprintf(os.Stderr, "Child: Timeout reading from pipe: %d\n", i)
+			os.Exit(1)
+		}
+	}
+	fmt.Fprintf(os.Stderr, "child: %s", response)
+}
+
+func cmdStderrFail(...string) {
+	fmt.Fprintf(os.Stderr, "some stderr text\n")
+	os.Exit(1)
+}
+
+func cmdYes(args ...string) {
+	if len(args) == 0 {
+		args = []string{"y"}
+	}
+	s := strings.Join(args, " ") + "\n"
+	for {
+		_, err := os.Stdout.WriteString(s)
+		if err != nil {
+			os.Exit(1)
+		}
+	}
 }
 
 func TestEcho(t *testing.T) {
@@ -83,7 +316,7 @@
 }
 
 func TestCommandRelativeName(t *testing.T) {
-	testenv.MustHaveExec(t)
+	cmd := helperCommand(t, "echo", "foo")
 
 	// Run our own binary as a relative path
 	// (e.g. "_test/exec.test") our parent directory.
@@ -98,9 +331,8 @@
 		t.Skipf("skipping; unexpected shallow dir of %q", dir)
 	}
 
-	cmd := exec.Command(filepath.Join(dirBase, base), "-test.run=TestHelperProcess", "--", "echo", "foo")
+	cmd.Path = filepath.Join(dirBase, base)
 	cmd.Dir = parentDir
-	cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
 
 	out, err := cmd.Output()
 	if err != nil {
@@ -159,7 +391,7 @@
 	if !strings.HasPrefix(errLine, "Error: open /bogus/file.foo") {
 		t.Errorf("expected stderr to complain about file; got %q", errLine)
 	}
-	if !strings.Contains(body, "func TestHelperProcess(t *testing.T)") {
+	if !strings.Contains(body, "func TestCatGoodAndBadFile(t *testing.T)") {
 		t.Errorf("expected test code; got %q (len %d)", body, len(body))
 	}
 }
@@ -394,6 +626,7 @@
 }
 
 func TestExtraFilesFDShuffle(t *testing.T) {
+	maySkipHelperCommand("extraFilesAndPipes")
 	testenv.SkipFlaky(t, 5780)
 	switch runtime.GOOS {
 	case "windows":
@@ -619,6 +852,7 @@
 
 func TestExtraFilesRace(t *testing.T) {
 	if runtime.GOOS == "windows" {
+		maySkipHelperCommand("describefiles")
 		t.Skip("no operating system support; skipping")
 	}
 	listen := func() net.Listener {
@@ -676,167 +910,6 @@
 	}
 }
 
-// TestHelperProcess isn't a real test. It's used as a helper process
-// for TestParameterRun.
-func TestHelperProcess(*testing.T) {
-	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
-		return
-	}
-	defer os.Exit(0)
-
-	args := os.Args
-	for len(args) > 0 {
-		if args[0] == "--" {
-			args = args[1:]
-			break
-		}
-		args = args[1:]
-	}
-	if len(args) == 0 {
-		fmt.Fprintf(os.Stderr, "No command\n")
-		os.Exit(2)
-	}
-
-	cmd, args := args[0], args[1:]
-	switch cmd {
-	case "echo":
-		iargs := []any{}
-		for _, s := range args {
-			iargs = append(iargs, s)
-		}
-		fmt.Println(iargs...)
-	case "echoenv":
-		for _, s := range args {
-			fmt.Println(os.Getenv(s))
-		}
-		os.Exit(0)
-	case "cat":
-		if len(args) == 0 {
-			io.Copy(os.Stdout, os.Stdin)
-			return
-		}
-		exit := 0
-		for _, fn := range args {
-			f, err := os.Open(fn)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-				exit = 2
-			} else {
-				defer f.Close()
-				io.Copy(os.Stdout, f)
-			}
-		}
-		os.Exit(exit)
-	case "pipetest":
-		bufr := bufio.NewReader(os.Stdin)
-		for {
-			line, _, err := bufr.ReadLine()
-			if err == io.EOF {
-				break
-			} else if err != nil {
-				os.Exit(1)
-			}
-			if bytes.HasPrefix(line, []byte("O:")) {
-				os.Stdout.Write(line)
-				os.Stdout.Write([]byte{'\n'})
-			} else if bytes.HasPrefix(line, []byte("E:")) {
-				os.Stderr.Write(line)
-				os.Stderr.Write([]byte{'\n'})
-			} else {
-				os.Exit(1)
-			}
-		}
-	case "stdinClose":
-		b, err := io.ReadAll(os.Stdin)
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-			os.Exit(1)
-		}
-		if s := string(b); s != stdinCloseTestString {
-			fmt.Fprintf(os.Stderr, "Error: Read %q, want %q", s, stdinCloseTestString)
-			os.Exit(1)
-		}
-		os.Exit(0)
-	case "exit":
-		n, _ := strconv.Atoi(args[0])
-		os.Exit(n)
-	case "describefiles":
-		f := os.NewFile(3, fmt.Sprintf("fd3"))
-		ln, err := net.FileListener(f)
-		if err == nil {
-			fmt.Printf("fd3: listener %s\n", ln.Addr())
-			ln.Close()
-		}
-		os.Exit(0)
-	case "extraFilesAndPipes":
-		n, _ := strconv.Atoi(args[0])
-		pipes := make([]*os.File, n)
-		for i := 0; i < n; i++ {
-			pipes[i] = os.NewFile(uintptr(3+i), strconv.Itoa(i))
-		}
-		response := ""
-		for i, r := range pipes {
-			ch := make(chan string, 1)
-			go func(c chan string) {
-				buf := make([]byte, 10)
-				n, err := r.Read(buf)
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "Child: read error: %v on pipe %d\n", err, i)
-					os.Exit(1)
-				}
-				c <- string(buf[:n])
-				close(c)
-			}(ch)
-			select {
-			case m := <-ch:
-				response = response + m
-			case <-time.After(5 * time.Second):
-				fmt.Fprintf(os.Stderr, "Child: Timeout reading from pipe: %d\n", i)
-				os.Exit(1)
-			}
-		}
-		fmt.Fprintf(os.Stderr, "child: %s", response)
-		os.Exit(0)
-	case "exec":
-		cmd := exec.Command(args[1])
-		cmd.Dir = args[0]
-		output, err := cmd.CombinedOutput()
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Child: %s %s", err, string(output))
-			os.Exit(1)
-		}
-		fmt.Printf("%s", string(output))
-		os.Exit(0)
-	case "lookpath":
-		p, err := exec.LookPath(args[0])
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "LookPath failed: %v\n", err)
-			os.Exit(1)
-		}
-		fmt.Print(p)
-		os.Exit(0)
-	case "stderrfail":
-		fmt.Fprintf(os.Stderr, "some stderr text\n")
-		os.Exit(1)
-	case "sleep":
-		time.Sleep(3 * time.Second)
-		os.Exit(0)
-	case "pipehandle":
-		handle, _ := strconv.ParseUint(args[0], 16, 64)
-		pipe := os.NewFile(uintptr(handle), "")
-		_, err := fmt.Fprint(pipe, args[1])
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "writing to pipe failed: %v\n", err)
-			os.Exit(1)
-		}
-		pipe.Close()
-		os.Exit(0)
-	default:
-		fmt.Fprintf(os.Stderr, "Unknown command %q\n", cmd)
-		os.Exit(2)
-	}
-}
-
 type delayedInfiniteReader struct{}
 
 func (delayedInfiniteReader) Read(b []byte) (int, error) {
@@ -849,8 +922,6 @@
 
 // Issue 9173: ignore stdin pipe writes if the program completes successfully.
 func TestIgnorePipeErrorOnSuccess(t *testing.T) {
-	testenv.MustHaveExec(t)
-
 	testWith := func(r io.Reader) func(*testing.T) {
 		return func(t *testing.T) {
 			cmd := helperCommand(t, "echo", "foo")
@@ -876,12 +947,7 @@
 }
 
 func TestClosePipeOnCopyError(t *testing.T) {
-	testenv.MustHaveExec(t)
-
-	if runtime.GOOS == "windows" || runtime.GOOS == "plan9" {
-		t.Skipf("skipping test on %s - no yes command", runtime.GOOS)
-	}
-	cmd := exec.Command("yes")
+	cmd := helperCommand(t, "yes")
 	cmd.Stdout = new(badWriter)
 	c := make(chan int, 1)
 	go func() {
@@ -900,8 +966,6 @@
 }
 
 func TestOutputStderrCapture(t *testing.T) {
-	testenv.MustHaveExec(t)
-
 	cmd := helperCommand(t, "stderrfail")
 	_, err := cmd.Output()
 	ee, ok := err.(*exec.ExitError)
@@ -955,6 +1019,7 @@
 
 func TestContextCancel(t *testing.T) {
 	if runtime.GOOS == "netbsd" && runtime.GOARCH == "arm64" {
+		maySkipHelperCommand("cat")
 		testenv.SkipFlaky(t, 42061)
 	}
 
@@ -1016,10 +1081,8 @@
 
 // test that environment variables are de-duped.
 func TestDedupEnvEcho(t *testing.T) {
-	testenv.MustHaveExec(t)
-
 	cmd := helperCommand(t, "echoenv", "FOO")
-	cmd.Env = append(cmd.Env, "FOO=bad", "FOO=good")
+	cmd.Env = append(cmd.Environ(), "FOO=bad", "FOO=good")
 	out, err := cmd.CombinedOutput()
 	if err != nil {
 		t.Fatal(err)
@@ -1062,22 +1125,3 @@
 		t.Errorf("String(%q, %q) = %q, want %q", "makemeasandwich", "-lettuce", got, want)
 	}
 }
-
-// start a child process without the user code explicitly starting
-// with a copy of the parent's. (The Windows SYSTEMROOT issue: Issue
-// 25210)
-func TestChildCriticalEnv(t *testing.T) {
-	testenv.MustHaveExec(t)
-	if runtime.GOOS != "windows" {
-		t.Skip("only testing on Windows")
-	}
-	cmd := helperCommand(t, "echoenv", "SYSTEMROOT")
-	cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		t.Fatal(err)
-	}
-	if strings.TrimSpace(string(out)) == "" {
-		t.Error("no SYSTEMROOT found")
-	}
-}
diff --git a/src/os/exec/exec_unix.go b/src/os/exec/exec_unix.go
index c20f352..3ed672a 100644
--- a/src/os/exec/exec_unix.go
+++ b/src/os/exec/exec_unix.go
@@ -11,14 +11,14 @@
 	"syscall"
 )
 
-func init() {
-	skipStdinCopyError = func(err error) bool {
-		// Ignore EPIPE errors copying to stdin if the program
-		// completed successfully otherwise.
-		// See Issue 9173.
-		pe, ok := err.(*fs.PathError)
-		return ok &&
-			pe.Op == "write" && pe.Path == "|1" &&
-			pe.Err == syscall.EPIPE
-	}
+// skipStdinCopyError reports whether the provided stdin copy error
+// should be ignored.
+func skipStdinCopyError(err error) bool {
+	// Ignore EPIPE errors copying to stdin if the program
+	// completed successfully otherwise.
+	// See Issue 9173.
+	pe, ok := err.(*fs.PathError)
+	return ok &&
+		pe.Op == "write" && pe.Path == "|1" &&
+		pe.Err == syscall.EPIPE
 }
diff --git a/src/os/exec/exec_windows.go b/src/os/exec/exec_windows.go
index bb937f8..e7a2ee6 100644
--- a/src/os/exec/exec_windows.go
+++ b/src/os/exec/exec_windows.go
@@ -9,15 +9,15 @@
 	"syscall"
 )
 
-func init() {
-	skipStdinCopyError = func(err error) bool {
-		// Ignore ERROR_BROKEN_PIPE and ERROR_NO_DATA errors copying
-		// to stdin if the program completed successfully otherwise.
-		// See Issue 20445.
-		const _ERROR_NO_DATA = syscall.Errno(0xe8)
-		pe, ok := err.(*fs.PathError)
-		return ok &&
-			pe.Op == "write" && pe.Path == "|1" &&
-			(pe.Err == syscall.ERROR_BROKEN_PIPE || pe.Err == _ERROR_NO_DATA)
-	}
+// skipStdinCopyError reports whether the provided stdin copy error
+// should be ignored.
+func skipStdinCopyError(err error) bool {
+	// Ignore ERROR_BROKEN_PIPE and ERROR_NO_DATA errors copying
+	// to stdin if the program completed successfully otherwise.
+	// See Issue 20445.
+	const _ERROR_NO_DATA = syscall.Errno(0xe8)
+	pe, ok := err.(*fs.PathError)
+	return ok &&
+		pe.Op == "write" && pe.Path == "|1" &&
+		(pe.Err == syscall.ERROR_BROKEN_PIPE || pe.Err == _ERROR_NO_DATA)
 }
diff --git a/src/os/exec/exec_windows_test.go b/src/os/exec/exec_windows_test.go
index 8e31e47..35ae0b0 100644
--- a/src/os/exec/exec_windows_test.go
+++ b/src/os/exec/exec_windows_test.go
@@ -7,14 +7,31 @@
 package exec_test
 
 import (
+	"fmt"
 	"io"
 	"os"
 	"os/exec"
 	"strconv"
+	"strings"
 	"syscall"
 	"testing"
 )
 
+func init() {
+	registerHelperCommand("pipehandle", cmdPipeHandle)
+}
+
+func cmdPipeHandle(args ...string) {
+	handle, _ := strconv.ParseUint(args[0], 16, 64)
+	pipe := os.NewFile(uintptr(handle), "")
+	_, err := fmt.Fprint(pipe, args[1])
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "writing to pipe failed: %v\n", err)
+		os.Exit(1)
+	}
+	pipe.Close()
+}
+
 func TestPipePassing(t *testing.T) {
 	r, w, err := os.Pipe()
 	if err != nil {
@@ -54,3 +71,28 @@
 		t.Fatalf("got exit code %d; want 88", exitError.ExitCode())
 	}
 }
+
+// Start a child process whose environment does not explicitly include
+// a copy of the parent's SYSTEMROOT, and check that the critical
+// variable is injected anyway. (See issue 25210.)
+func TestChildCriticalEnv(t *testing.T) {
+	cmd := helperCommand(t, "echoenv", "SYSTEMROOT")
+
+	// Explicitly remove SYSTEMROOT from the command's environment.
+	var env []string
+	for _, kv := range cmd.Environ() {
+		k, _, ok := strings.Cut(kv, "=")
+		if !ok || !strings.EqualFold(k, "SYSTEMROOT") {
+			env = append(env, kv)
+		}
+	}
+	cmd.Env = env
+
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if strings.TrimSpace(string(out)) == "" {
+		t.Error("no SYSTEMROOT found")
+	}
+}
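
The behavior under test comes from the addCriticalEnv step in environ: on
Windows, even an explicitly set Env that omits SYSTEMROOT has it re-added
before the process starts. A usage sketch:

	cmd := exec.Command("cmd", "/c", "echo %SYSTEMROOT%")
	cmd.Env = []string{"FOO=bar"} // SYSTEMROOT deliberately omitted
	out, _ := cmd.CombinedOutput()
	// out still names the system root: the exec package injects
	// SYSTEMROOT, which many Windows programs require to run at all.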
diff --git a/src/os/exec/lp_windows_test.go b/src/os/exec/lp_windows_test.go
index bbf6a9b..34abe09 100644
--- a/src/os/exec/lp_windows_test.go
+++ b/src/os/exec/lp_windows_test.go
@@ -19,6 +19,31 @@
 	"testing"
 )
 
+func init() {
+	registerHelperCommand("exec", cmdExec)
+	registerHelperCommand("lookpath", cmdLookPath)
+}
+
+func cmdLookPath(args ...string) {
+	p, err := exec.LookPath(args[0])
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "LookPath failed: %v\n", err)
+		os.Exit(1)
+	}
+	fmt.Print(p)
+}
+
+func cmdExec(args ...string) {
+	cmd := exec.Command(args[1])
+	cmd.Dir = args[0]
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Child: %s %s", err, string(output))
+		os.Exit(1)
+	}
+	fmt.Printf("%s", string(output))
+}
+
 func installExe(t *testing.T, dest, src string) {
 	fsrc, err := os.Open(src)
 	if err != nil {
@@ -66,10 +91,10 @@
 	fails     bool // test is expected to fail
 }
 
-func (test lookPathTest) runProg(t *testing.T, env []string, args ...string) (string, error) {
-	cmd := exec.Command(args[0], args[1:]...)
+func (test lookPathTest) runProg(t *testing.T, env []string, cmd *exec.Cmd) (string, error) {
 	cmd.Env = env
 	cmd.Dir = test.rootDir
+	args := append([]string(nil), cmd.Args...)
 	args[0] = filepath.Base(args[0])
 	cmdText := fmt.Sprintf("%q command", strings.Join(args, " "))
 	out, err := cmd.CombinedOutput()
@@ -135,10 +160,9 @@
 	// Run "cmd.exe /c test.searchFor" with new environment and
 	// work directory set. All candidates are copies of printpath.exe.
 	// These will output their program paths when run.
-	should, errCmd := test.runProg(t, env, "cmd", "/c", test.searchFor)
+	should, errCmd := test.runProg(t, env, exec.Command("cmd", "/c", test.searchFor))
 	// Run the lookpath program with new environment and work directory set.
-	env = append(env, "GO_WANT_HELPER_PROCESS=1")
-	have, errLP := test.runProg(t, env, os.Args[0], "-test.run=TestHelperProcess", "--", "lookpath", test.searchFor)
+	have, errLP := test.runProg(t, env, helperCommand(t, "lookpath", test.searchFor))
 	// Compare results.
 	if errCmd == nil && errLP == nil {
 		// both succeeded
@@ -346,30 +370,26 @@
 	return nil
 }
 
-func (test commandTest) runOne(rootDir string, env []string, dir, arg0 string) error {
-	cmd := exec.Command(os.Args[0], "-test.run=TestHelperProcess", "--", "exec", dir, arg0)
+func (test commandTest) runOne(t *testing.T, rootDir string, env []string, dir, arg0 string) {
+	cmd := helperCommand(t, "exec", dir, arg0)
 	cmd.Dir = rootDir
 	cmd.Env = env
 	output, err := cmd.CombinedOutput()
 	err = test.isSuccess(rootDir, string(output), err)
 	if (err != nil) != test.fails {
 		if test.fails {
-			return fmt.Errorf("test=%+v: succeeded, but expected to fail", test)
+			t.Errorf("test=%+v: succeeded, but expected to fail", test)
+		} else {
+			t.Error(err)
 		}
-		return err
 	}
-	return nil
 }
 
 func (test commandTest) run(t *testing.T, rootDir, printpathExe string) {
 	createFiles(t, rootDir, test.files, printpathExe)
 	PATHEXT := `.COM;.EXE;.BAT`
 	env := createEnv(rootDir, test.PATH, PATHEXT)
-	env = append(env, "GO_WANT_HELPER_PROCESS=1")
-	err := test.runOne(rootDir, env, test.dir, test.arg0)
-	if err != nil {
-		t.Error(err)
-	}
+	test.runOne(t, rootDir, env, test.dir, test.arg0)
 }
 
 var commandTests = []commandTest{
diff --git a/src/os/exec/read3.go b/src/os/exec/read3.go
index 10cbfbd..8327d73 100644
--- a/src/os/exec/read3.go
+++ b/src/os/exec/read3.go
@@ -6,7 +6,7 @@
 
 // This is a test program that verifies that it can read from
 // descriptor 3 and that no other descriptors are open.
-// This is not done via TestHelperProcess and GO_WANT_HELPER_PROCESS
+// This is not done via the TestMain helper-command mechanism and GO_EXEC_TEST_PID
 // because we want to ensure that this program does not use cgo,
 // because C libraries can open file descriptors behind our backs
 // and confuse the test. See issue 25628.
diff --git a/src/os/os_test.go b/src/os/os_test.go
index d071b47..df00f16 100644
--- a/src/os/os_test.go
+++ b/src/os/os_test.go
@@ -2429,6 +2429,9 @@
 		// like it does on Unix.
 		t.Skip("skipping on windows")
 	}
+	if runtime.GOOS == "dragonfly" {
+		testenv.SkipFlaky(t, 52301)
+	}
 
 	n := runtime.GOMAXPROCS(16)
 	defer runtime.GOMAXPROCS(n)
diff --git a/src/os/rlimit_test.go b/src/os/rlimit_test.go
index 58a6a05..c02e36f 100644
--- a/src/os/rlimit_test.go
+++ b/src/os/rlimit_test.go
@@ -11,18 +11,21 @@
 )
 
 func TestOpenFileLimit(t *testing.T) {
-	if runtime.GOOS == "openbsd" && (runtime.GOARCH == "arm" || runtime.GOARCH == "arm64" || runtime.GOARCH == "mips64") {
-		t.Skip("broken on openbsd/arm, openbsd/arm64, openbsd/mips64 builder - go.dev/issue/51713")
-	}
-
 	// For open file count,
 	// macOS sets the default soft limit to 256 and no hard limit.
 	// CentOS and Fedora set the default soft limit to 1024,
 	// with hard limits of 4096 and 524288, respectively.
 	// Check that we can open 1200 files, which proves
 	// that the rlimit is being raised appropriately on those systems.
+	fileCount := 1200
+
+	// OpenBSD has a default soft limit of 512 and hard limit of 1024.
+	if runtime.GOOS == "openbsd" {
+		fileCount = 768
+	}
+
 	var files []*File
-	for i := 0; i < 1200; i++ {
+	for i := 0; i < fileCount; i++ {
 		f, err := Open("rlimit.go")
 		if err != nil {
 			t.Error(err)
diff --git a/src/reflect/all_test.go b/src/reflect/all_test.go
index a886f9f..9eb01bd 100644
--- a/src/reflect/all_test.go
+++ b/src/reflect/all_test.go
@@ -7823,3 +7823,159 @@
 		t.Fatalf("Kind(-1).String() = %q, want %q", s, want)
 	}
 }
+
+type (
+	namedBool  bool
+	namedBytes []byte
+)
+
+var sourceAll = struct {
+	Bool         Value
+	String       Value
+	Bytes        Value
+	NamedBytes   Value
+	BytesArray   Value
+	SliceAny     Value
+	MapStringAny Value
+}{
+	Bool:         ValueOf(new(bool)).Elem(),
+	String:       ValueOf(new(string)).Elem(),
+	Bytes:        ValueOf(new([]byte)).Elem(),
+	NamedBytes:   ValueOf(new(namedBytes)).Elem(),
+	BytesArray:   ValueOf(new([32]byte)).Elem(),
+	SliceAny:     ValueOf(new([]any)).Elem(),
+	MapStringAny: ValueOf(new(map[string]any)).Elem(),
+}
+
+var sinkAll struct {
+	RawBool   bool
+	RawString string
+	RawBytes  []byte
+	RawInt    int
+}
+
+func BenchmarkBool(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawBool = sourceAll.Bool.Bool()
+	}
+}
+
+func BenchmarkString(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawString = sourceAll.String.String()
+	}
+}
+
+func BenchmarkBytes(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawBytes = sourceAll.Bytes.Bytes()
+	}
+}
+
+func BenchmarkNamedBytes(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawBytes = sourceAll.NamedBytes.Bytes()
+	}
+}
+
+func BenchmarkBytesArray(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawBytes = sourceAll.BytesArray.Bytes()
+	}
+}
+
+func BenchmarkSliceLen(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawInt = sourceAll.SliceAny.Len()
+	}
+}
+
+func BenchmarkMapLen(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawInt = sourceAll.MapStringAny.Len()
+	}
+}
+
+func BenchmarkStringLen(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawInt = sourceAll.String.Len()
+	}
+}
+
+func BenchmarkArrayLen(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawInt = sourceAll.BytesArray.Len()
+	}
+}
+
+func BenchmarkSliceCap(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		sinkAll.RawInt = sourceAll.SliceAny.Cap()
+	}
+}
+
+func TestValue_Cap(t *testing.T) {
+	a := &[3]int{1, 2, 3}
+	v := ValueOf(a)
+	if v.Cap() != cap(a) {
+		t.Errorf("Cap = %d want %d", v.Cap(), cap(a))
+	}
+
+	a = nil
+	v = ValueOf(a)
+	if v.Cap() != cap(a) {
+		t.Errorf("Cap = %d want %d", v.Cap(), cap(a))
+	}
+
+	getError := func(f func()) (errorStr string) {
+		defer func() {
+			e := recover()
+			if str, ok := e.(string); ok {
+				errorStr = str
+			}
+		}()
+		f()
+		return
+	}
+	e := getError(func() {
+		var ptr *int
+		ValueOf(ptr).Cap()
+	})
+	wantStr := "reflect: call of reflect.Value.Cap on ptr to non-array Value"
+	if e != wantStr {
+		t.Errorf("error is %q, want %q", e, wantStr)
+	}
+}
+
+func TestValue_Len(t *testing.T) {
+	a := &[3]int{1, 2, 3}
+	v := ValueOf(a)
+	if v.Len() != len(a) {
+		t.Errorf("Len = %d want %d", v.Len(), len(a))
+	}
+
+	a = nil
+	v = ValueOf(a)
+	if v.Len() != len(a) {
+		t.Errorf("Len = %d want %d", v.Len(), len(a))
+	}
+
+	getError := func(f func()) (errorStr string) {
+		defer func() {
+			e := recover()
+			if str, ok := e.(string); ok {
+				errorStr = str
+			}
+		}()
+		f()
+		return
+	}
+	e := getError(func() {
+		var ptr *int
+		ValueOf(ptr).Len()
+	})
+	wantStr := "reflect: call of reflect.Value.Len on ptr to non-array Value"
+	if e != wantStr {
+		t.Errorf("error is %q, want %q", e, wantStr)
+	}
+}
diff --git a/src/reflect/type.go b/src/reflect/type.go
index e888266..97040b5 100644
--- a/src/reflect/type.go
+++ b/src/reflect/type.go
@@ -2244,15 +2244,14 @@
 	}
 
 	// Prepare GC data if any.
-	// A bucket is at most bucketSize*(1+maxKeySize+maxValSize)+2*ptrSize bytes,
-	// or 2072 bytes, or 259 pointer-size words, or 33 bytes of pointer bitmap.
+	// A bucket is at most bucketSize*(1+maxKeySize+maxValSize)+ptrSize bytes,
+	// or 2064 bytes, or 258 pointer-size words, or 33 bytes of pointer bitmap.
 	// Note that since the key and value are known to be <= 128 bytes,
 	// they're guaranteed to have bitmaps instead of GC programs.
 	var gcdata *byte
 	var ptrdata uintptr
-	var overflowPad uintptr
 
-	size := bucketSize*(1+ktyp.size+etyp.size) + overflowPad + goarch.PtrSize
+	size := bucketSize*(1+ktyp.size+etyp.size) + goarch.PtrSize
 	if size&uintptr(ktyp.align-1) != 0 || size&uintptr(etyp.align-1) != 0 {
 		panic("reflect: bad size computation in MapOf")
 	}
@@ -2271,7 +2270,6 @@
 			emitGCMask(mask, base, etyp, bucketSize)
 		}
 		base += bucketSize * etyp.size / goarch.PtrSize
-		base += overflowPad / goarch.PtrSize
 
 		word := base
 		mask[word/8] |= 1 << (word % 8)
@@ -2291,9 +2289,6 @@
 		ptrdata: ptrdata,
 		gcdata:  gcdata,
 	}
-	if overflowPad > 0 {
-		b.align = 8
-	}
 	s := "bucket(" + ktyp.String() + "," + etyp.String() + ")"
 	b.str = resolveReflectName(newName(s, "", false))
 	return b
diff --git a/src/reflect/value.go b/src/reflect/value.go
index 6fe3cee..f92fa16 100644
--- a/src/reflect/value.go
+++ b/src/reflect/value.go
@@ -281,14 +281,31 @@
 // Bool returns v's underlying value.
 // It panics if v's kind is not Bool.
 func (v Value) Bool() bool {
-	v.mustBe(Bool)
+	// panicNotBool is split out to keep Bool inlineable.
+	if v.kind() != Bool {
+		v.panicNotBool()
+	}
 	return *(*bool)(v.ptr)
 }
 
+func (v Value) panicNotBool() {
+	v.mustBe(Bool)
+}
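
The split is a general trick for staying within the compiler's inlining
budget: keep the hot path tiny and move the cold path into a separate
function. A generic sketch of the same pattern (hypothetical type):

	type buf struct{ data []byte }

	// Get is small enough to inline; the rare path is outlined.
	func (b *buf) Get(i int) byte {
		if i < len(b.data) {
			return b.data[i]
		}
		return b.getSlow(i)
	}

	func (b *buf) getSlow(i int) byte {
		panic("buf: index out of range")
	}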
+
+var bytesType = TypeOf(([]byte)(nil)).(*rtype)
+
 // Bytes returns v's underlying value.
 // It panics if v's underlying value is not a slice of bytes or
 // an addressable array of bytes.
 func (v Value) Bytes() []byte {
+	// bytesSlow is split out to keep Bytes inlineable for unnamed []byte.
+	if v.typ == bytesType {
+		return *(*[]byte)(v.ptr)
+	}
+	return v.bytesSlow()
+}
+
+func (v Value) bytesSlow() []byte {
 	switch v.kind() {
 	case Slice:
 		if v.typ.Elem().Kind() != Uint8 {
@@ -1127,17 +1144,27 @@
 }
 
 // Cap returns v's capacity.
-// It panics if v's Kind is not Array, Chan, or Slice.
+// It panics if v's Kind is not Array, Chan, Slice, or pointer to Array.
 func (v Value) Cap() int {
+	// capNonSlice is split out to keep Cap inlineable for slice kinds.
+	if v.kind() == Slice {
+		return (*unsafeheader.Slice)(v.ptr).Cap
+	}
+	return v.capNonSlice()
+}
+
+func (v Value) capNonSlice() int {
 	k := v.kind()
 	switch k {
 	case Array:
 		return v.typ.Len()
 	case Chan:
 		return chancap(v.pointer())
-	case Slice:
-		// Slice is always bigger than a word; assume flagIndir.
-		return (*unsafeheader.Slice)(v.ptr).Cap
+	case Ptr:
+		if v.typ.Elem().Kind() == Array {
+			return v.typ.Elem().Len()
+		}
+		panic("reflect: call of reflect.Value.Cap on ptr to non-array Value")
 	}
 	panic(&ValueError{"reflect.Value.Cap", v.kind()})
 }
@@ -1578,10 +1605,17 @@
 }
 
 // Len returns v's length.
-// It panics if v's Kind is not Array, Chan, Map, Slice, or String.
+// It panics if v's Kind is not Array, Chan, Map, Slice, String, or pointer to Array.
 func (v Value) Len() int {
-	k := v.kind()
-	switch k {
+	// lenNonSlice is split out to keep Len inlineable for slice kinds.
+	if v.kind() == Slice {
+		return (*unsafeheader.Slice)(v.ptr).Len
+	}
+	return v.lenNonSlice()
+}
+
+func (v Value) lenNonSlice() int {
+	switch k := v.kind(); k {
 	case Array:
 		tt := (*arrayType)(unsafe.Pointer(v.typ))
 		return int(tt.len)
@@ -1589,12 +1623,14 @@
 		return chanlen(v.pointer())
 	case Map:
 		return maplen(v.pointer())
-	case Slice:
-		// Slice is bigger than a word; assume flagIndir.
-		return (*unsafeheader.Slice)(v.ptr).Len
 	case String:
 		// String is bigger than a word; assume flagIndir.
 		return (*unsafeheader.String)(v.ptr).Len
+	case Ptr:
+		if v.typ.Elem().Kind() == Array {
+			return v.typ.Elem().Len()
+		}
+		panic("reflect: call of reflect.Value.Len on ptr to non-array Value")
 	}
 	panic(&ValueError{"reflect.Value.Len", v.kind()})
 }
@@ -2441,12 +2477,17 @@
 // The fmt package treats Values specially. It does not call their String
 // method implicitly but instead prints the concrete values they hold.
 func (v Value) String() string {
-	switch k := v.kind(); k {
-	case Invalid:
-		return "<invalid Value>"
-	case String:
+	// stringNonString is split out to keep String inlineable for string kinds.
+	if v.kind() == String {
 		return *(*string)(v.ptr)
 	}
+	return v.stringNonString()
+}
+
+func (v Value) stringNonString() string {
+	if v.kind() == Invalid {
+		return "<invalid Value>"
+	}
 	// If you call String on a reflect.Value of other type, it's better to
 	// print something than to panic. Useful in debugging.
 	return "<" + v.Type().String() + " Value>"
@@ -3056,9 +3097,10 @@
 	return Value{t.ptrTo(), p, fl}
 }
 
-// assignTo returns a value v that can be assigned directly to typ.
-// It panics if v is not assignable to typ.
-// For a conversion to an interface type, target is a suggested scratch space to use.
+// assignTo returns a value v that can be assigned directly to dst.
+// It panics if v is not assignable to dst.
+// For a conversion to an interface type, target, if not nil,
+// is a suggested scratch space to use.
 // target must be initialized memory (or nil).
 func (v Value) assignTo(context string, dst *rtype, target unsafe.Pointer) Value {
 	if v.flag&flagMethod != 0 {
@@ -3074,9 +3116,6 @@
 		return Value{dst, v.ptr, fl}
 
 	case implements(dst, v.typ):
-		if target == nil {
-			target = unsafe_New(dst)
-		}
 		if v.Kind() == Interface && v.IsNil() {
 			// A nil ReadWriter passed to nil Reader is OK,
 			// but using ifaceE2I below will panic.
@@ -3084,6 +3123,9 @@
 			return Value{dst, nil, flag(Interface)}
 		}
 		x := valueInterface(v, false)
+		if target == nil {
+			target = unsafe_New(dst)
+		}
 		if dst.NumMethod() == 0 {
 			*(*any)(target) = x
 		} else {
diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go
index cfb703d..350f297 100644
--- a/src/regexp/syntax/parse.go
+++ b/src/regexp/syntax/parse.go
@@ -43,7 +43,7 @@
 	ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
 	ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
 	ErrUnexpectedParen       ErrorCode = "unexpected )"
-	ErrInvalidDepth          ErrorCode = "invalid nesting depth"
+	ErrNestingDepth          ErrorCode = "expression nests too deeply"
 )
 
 func (e ErrorCode) String() string {
@@ -134,7 +134,7 @@
 		}
 	}
 	if p.calcHeight(re, true) > maxHeight {
-		panic(ErrInvalidDepth)
+		panic(ErrNestingDepth)
 	}
 }
 
@@ -762,8 +762,8 @@
 			panic(r)
 		case nil:
 			// ok
-		case ErrInvalidDepth:
-			err = &Error{Code: ErrInvalidDepth, Expr: s}
+		case ErrNestingDepth:
+			err = &Error{Code: ErrNestingDepth, Expr: s}
 		}
 	}()
 
diff --git a/src/runtime/HACKING.md b/src/runtime/HACKING.md
index fbf22ee..6175524 100644
--- a/src/runtime/HACKING.md
+++ b/src/runtime/HACKING.md
@@ -277,6 +277,25 @@
 Since these are function-level annotations, code that releases or
 acquires a P may need to be split across two functions.
 
+go:uintptrkeepalive
+-------------------
+
+The //go:uintptrkeepalive directive must be followed by a function declaration.
+
+It specifies that the function's uintptr arguments may be pointer values that
+have been converted to uintptr and must be kept alive for the duration of the
+call, even though from the types alone it would appear that the object is no
+longer needed during the call.
+
+This directive is similar to //go:uintptrescapes, but it does not force
+arguments to escape. Since stack growth does not understand these arguments,
+this directive must be used with //go:nosplit (in the marked function and all
+transitive calls) to prevent stack growth.
+
+The conversion from pointer to uintptr must appear in the argument list of any
+call to this function. This directive is used for some low-level system call
+implementations.
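
As an illustration (a hypothetical declaration, not one from the tree), a
low-level wrapper combining the two directives might look like:

	//go:uintptrkeepalive
	//go:nosplit
	func fcntl(fd, cmd, arg uintptr) int32

	// Callers must perform the pointer-to-uintptr conversion in the
	// argument list itself, so the compiler keeps the object alive:
	//	fcntl(fd, F_SETLK, uintptr(unsafe.Pointer(&lk)))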
+
 go:notinheap
 ------------
 
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 9ef7346..956910f 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -96,6 +96,10 @@
 	// start this M
 	BL	runtime·mstart(SB)
 
+	// Prevent dead-code elimination of debugCallV2, which is
+	// intended to be called by debuggers.
+	MOVD	$runtime·debugCallV2<ABIInternal>(SB), R0
+
 	MOVD	$0, R0
 	MOVD	R0, (R0)	// boom
 	UNDEF
@@ -1240,6 +1244,200 @@
 	LDP	21*8(RSP), (R25, R26)
 	JMP	ret
 
+DATA	debugCallFrameTooLarge<>+0x00(SB)/20, $"call frame too large"
+GLOBL	debugCallFrameTooLarge<>(SB), RODATA, $20	// Size duplicated below
+
+// debugCallV2 is the entry point for debugger-injected function
+// calls on running goroutines. It informs the runtime that a
+// debug call has been injected and creates a call frame for the
+// debugger to fill in.
+//
+// To inject a function call, a debugger should:
+// 1. Check that the goroutine is in state _Grunning and that
+//    there are at least 288 bytes free on the stack.
+// 2. Set SP as SP-16.
+// 3. Store the current LR in (SP) (using the SP after step 2).
+// 4. Store the current PC in the LR register.
+// 5. Write the desired argument frame size at SP-16.
+// 6. Save all machine registers (including flags and fpsimd registers)
+//    so they can be restored later by the debugger.
+// 7. Set the PC to debugCallV2 and resume execution.
+//
+// If the goroutine is in state _Grunnable, then it's not generally
+// safe to inject a call because it may return out via other runtime
+// operations. Instead, the debugger should unwind the stack to find
+// the return to non-runtime code, add a temporary breakpoint there,
+// and inject the call once that breakpoint is hit.
+//
+// If the goroutine is in any other state, it's not safe to inject a call.
+//
+// This function communicates back to the debugger by setting R20 and
+// invoking BRK to raise a breakpoint signal. See the comments in the
+// implementation for the protocol the debugger is expected to
+// follow. InjectDebugCall in the runtime tests demonstrates this protocol.
+//
+// The debugger must ensure that any pointers passed to the function
+// obey escape analysis requirements. Specifically, it must not pass
+// a stack pointer to an escaping argument. debugCallV2 cannot check
+// this invariant.
+//
+// This is ABIInternal because Go code injects its PC directly into new
+// goroutine stacks.
+TEXT runtime·debugCallV2<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0
+	STP	(R29, R30), -280(RSP)
+	SUB	$272, RSP, RSP
+	SUB	$8, RSP, R29
+	// Save all registers that may contain pointers so they can be
+	// conservatively scanned.
+	//
+	// We can't do anything that might clobber any of these
+	// registers before this.
+	STP	(R27, g), (30*8)(RSP)
+	STP	(R25, R26), (28*8)(RSP)
+	STP	(R23, R24), (26*8)(RSP)
+	STP	(R21, R22), (24*8)(RSP)
+	STP	(R19, R20), (22*8)(RSP)
+	STP	(R16, R17), (20*8)(RSP)
+	STP	(R14, R15), (18*8)(RSP)
+	STP	(R12, R13), (16*8)(RSP)
+	STP	(R10, R11), (14*8)(RSP)
+	STP	(R8, R9), (12*8)(RSP)
+	STP	(R6, R7), (10*8)(RSP)
+	STP	(R4, R5), (8*8)(RSP)
+	STP	(R2, R3), (6*8)(RSP)
+	STP	(R0, R1), (4*8)(RSP)
+
+	// Perform a safe-point check.
+	MOVD	R30, 8(RSP) // Caller's PC
+	CALL	runtime·debugCallCheck(SB)
+	MOVD	16(RSP), R0
+	CBZ	R0, good
+
+	// The safety check failed. Put the reason string at the top
+	// of the stack.
+	MOVD	R0, 8(RSP)
+	MOVD	24(RSP), R0
+	MOVD	R0, 16(RSP)
+
+	// Set R20 to 8 and invoke BRK. The debugger should get the
+	// reason a call can't be injected from SP+8 and resume execution.
+	MOVD	$8, R20
+	BRK
+	JMP	restore
+
+good:
+	// Registers are saved and it's safe to make a call.
+	// Open up a call frame, moving the stack if necessary.
+	//
+	// Once the frame is allocated, this will set R20 to 0 and
+	// invoke BRK. The debugger should write the argument
+	// frame for the call at SP+8, set up argument registers,
+	// set LR to the signal PC + 4, set the PC to the function
+	// to call, set R26 to point to the closure (if a closure call),
+	// and resume execution.
+	//
+	// If the function returns, this will set R20 to 1 and invoke
+	// BRK. The debugger can then inspect any return value saved
+	// on the stack at SP+8 and in registers and resume execution again.
+	//
+	// If the function panics, this will set R20 to 2 and invoke BRK.
+	// The interface{} value of the panic will be at SP+8. The debugger
+	// can inspect the panic value and resume execution again.
+#define DEBUG_CALL_DISPATCH(NAME,MAXSIZE)	\
+	CMP	$MAXSIZE, R0;			\
+	BGT	5(PC);				\
+	MOVD	$NAME(SB), R0;			\
+	MOVD	R0, 8(RSP);			\
+	CALL	runtime·debugCallWrap(SB);	\
+	JMP	restore
+
+	MOVD	256(RSP), R0 // the argument frame size
+	DEBUG_CALL_DISPATCH(debugCall32<>, 32)
+	DEBUG_CALL_DISPATCH(debugCall64<>, 64)
+	DEBUG_CALL_DISPATCH(debugCall128<>, 128)
+	DEBUG_CALL_DISPATCH(debugCall256<>, 256)
+	DEBUG_CALL_DISPATCH(debugCall512<>, 512)
+	DEBUG_CALL_DISPATCH(debugCall1024<>, 1024)
+	DEBUG_CALL_DISPATCH(debugCall2048<>, 2048)
+	DEBUG_CALL_DISPATCH(debugCall4096<>, 4096)
+	DEBUG_CALL_DISPATCH(debugCall8192<>, 8192)
+	DEBUG_CALL_DISPATCH(debugCall16384<>, 16384)
+	DEBUG_CALL_DISPATCH(debugCall32768<>, 32768)
+	DEBUG_CALL_DISPATCH(debugCall65536<>, 65536)
+	// The frame size is too large. Report the error.
+	MOVD	$debugCallFrameTooLarge<>(SB), R0
+	MOVD	R0, 8(RSP)
+	MOVD	$20, R0
+	MOVD	R0, 16(RSP) // length of debugCallFrameTooLarge string
+	MOVD	$8, R20
+	BRK
+	JMP	restore
+
+restore:
+	// Calls and failures resume here.
+	//
+	// Set R20 to 16 and invoke BRK. The debugger should restore
+	// all registers except for PC and RSP and resume execution.
+	MOVD	$16, R20
+	BRK
+	// We must not modify flags after this point.
+
+	// Restore pointer-containing registers, which may have been
+	// modified from the debugger's copy by stack copying.
+	LDP	(30*8)(RSP), (R27, g)
+	LDP	(28*8)(RSP), (R25, R26)
+	LDP	(26*8)(RSP), (R23, R24)
+	LDP	(24*8)(RSP), (R21, R22)
+	LDP	(22*8)(RSP), (R19, R20)
+	LDP	(20*8)(RSP), (R16, R17)
+	LDP	(18*8)(RSP), (R14, R15)
+	LDP	(16*8)(RSP), (R12, R13)
+	LDP	(14*8)(RSP), (R10, R11)
+	LDP	(12*8)(RSP), (R8, R9)
+	LDP	(10*8)(RSP), (R6, R7)
+	LDP	(8*8)(RSP), (R4, R5)
+	LDP	(6*8)(RSP), (R2, R3)
+	LDP	(4*8)(RSP), (R0, R1)
+
+	LDP	-8(RSP), (R29, R27)
+	ADD	$288, RSP, RSP // Add 16 more bytes, see saveSigContext
+	MOVD	-16(RSP), R30 // restore old lr
+	JMP	(R27)
+
+// runtime.debugCallCheck assumes that functions defined with the
+// DEBUG_CALL_FN macro are safe points to inject calls.
+#define DEBUG_CALL_FN(NAME,MAXSIZE)		\
+TEXT NAME(SB),WRAPPER,$MAXSIZE-0;		\
+	NO_LOCAL_POINTERS;		\
+	MOVD	$0, R20;		\
+	BRK;		\
+	MOVD	$1, R20;		\
+	BRK;		\
+	RET
+DEBUG_CALL_FN(debugCall32<>, 32)
+DEBUG_CALL_FN(debugCall64<>, 64)
+DEBUG_CALL_FN(debugCall128<>, 128)
+DEBUG_CALL_FN(debugCall256<>, 256)
+DEBUG_CALL_FN(debugCall512<>, 512)
+DEBUG_CALL_FN(debugCall1024<>, 1024)
+DEBUG_CALL_FN(debugCall2048<>, 2048)
+DEBUG_CALL_FN(debugCall4096<>, 4096)
+DEBUG_CALL_FN(debugCall8192<>, 8192)
+DEBUG_CALL_FN(debugCall16384<>, 16384)
+DEBUG_CALL_FN(debugCall32768<>, 32768)
+DEBUG_CALL_FN(debugCall65536<>, 65536)
+
+// func debugCallPanicked(val interface{})
+TEXT runtime·debugCallPanicked(SB),NOSPLIT,$16-16
+	// Copy the panic value to the top of stack at SP+8.
+	MOVD	val_type+0(FP), R0
+	MOVD	R0, 8(RSP)
+	MOVD	val_data+8(FP), R0
+	MOVD	R0, 16(RSP)
+	MOVD	$2, R20
+	BRK
+	RET
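
The R20 values above form a small state machine on the debugger side. A
Go-flavored sketch of the client loop (helper names hypothetical; the real
client is InjectDebugCall in the runtime tests):

	for {
		resumeUntilBRK() // run the target until it raises the breakpoint
		switch readReg("R20") {
		case 0: // frame allocated: write args at SP+8, set LR and PC, resume
		case 1: // call returned: read results from SP+8 and the registers
		case 2: // call panicked: the interface{} value is at SP+8
		case 8: // injection refused: the reason string is at SP+8
		case 16: // restore all registers except PC and SP, then resume
			return
		}
	}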
+
 // Note: these functions use a special calling convention to save generated code space.
 // Arguments are passed in registers, but the space for those arguments are allocated
 // in the caller's stack frame. These stubs write the args into that stack space and
diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s
index 9052649..00caa9f 100644
--- a/src/runtime/asm_riscv64.s
+++ b/src/runtime/asm_riscv64.s
@@ -213,14 +213,14 @@
 	JMP	runtime·morestack(SB)
 
 // AES hashing not implemented for riscv64
-TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-32
-	JMP	runtime·memhashFallback(SB)
-TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-24
-	JMP	runtime·strhashFallback(SB)
-TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24
-	JMP	runtime·memhash32Fallback(SB)
-TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24
-	JMP	runtime·memhash64Fallback(SB)
+TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
+	JMP	runtime·memhashFallback<ABIInternal>(SB)
+TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·strhashFallback<ABIInternal>(SB)
+TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash32Fallback<ABIInternal>(SB)
+TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash64Fallback<ABIInternal>(SB)
 
 // func return0()
 TEXT runtime·return0(SB), NOSPLIT, $0
diff --git a/src/runtime/cgo/cgo.go b/src/runtime/cgo/cgo.go
index 4d2caf6..8d72d79 100644
--- a/src/runtime/cgo/cgo.go
+++ b/src/runtime/cgo/cgo.go
@@ -23,9 +23,6 @@
 #cgo solaris LDFLAGS: -lxnet
 #cgo illumos LDFLAGS: -lsocket
 
-// Issue 35247.
-#cgo darwin CFLAGS: -Wno-nullability-completeness
-
 #cgo CFLAGS: -Wall -Werror
 
 #cgo solaris CPPFLAGS: -D_POSIX_PTHREAD_SEMANTICS
diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go
index a8627e9..256f976 100644
--- a/src/runtime/chan_test.go
+++ b/src/runtime/chan_test.go
@@ -629,6 +629,9 @@
 	if runtime.GOOS == "netbsd" && runtime.GOARCH == "arm64" {
 		testenv.SkipFlaky(t, 49382)
 	}
+	if runtime.GOOS == "openbsd" {
+		testenv.SkipFlaky(t, 51482)
+	}
 
 	// The goal of this test is to trigger a "racy sudog adjustment"
 	// throw. Basically, there's a window between when a goroutine
diff --git a/src/runtime/debug_test.go b/src/runtime/debug_test.go
index 7698eac..75fe07e 100644
--- a/src/runtime/debug_test.go
+++ b/src/runtime/debug_test.go
@@ -9,7 +9,7 @@
 // spends all of its time in the race runtime, which isn't a safe
 // point.
 
-//go:build amd64 && linux && !race
+//go:build (amd64 || arm64) && linux && !race
 
 package runtime_test
 
diff --git a/src/runtime/debugcall.go b/src/runtime/debugcall.go
index 205971c..2f164e7 100644
--- a/src/runtime/debugcall.go
+++ b/src/runtime/debugcall.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build amd64
+//go:build amd64 || arm64
 
 package runtime
 
diff --git a/src/runtime/export_debug_amd64_test.go b/src/runtime/export_debug_amd64_test.go
new file mode 100644
index 0000000..f9908cd
--- /dev/null
+++ b/src/runtime/export_debug_amd64_test.go
@@ -0,0 +1,132 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 && linux
+
+package runtime
+
+import (
+	"internal/abi"
+	"internal/goarch"
+	"unsafe"
+)
+
+type sigContext struct {
+	savedRegs sigcontext
+	// sigcontext.fpstate is a pointer, so we need to save
+	// the value it points to in a fpstate1 structure.
+	savedFP fpstate1
+}
+
+func sigctxtSetContextRegister(ctxt *sigctxt, x uint64) {
+	ctxt.regs().rdx = x
+}
+
+func sigctxtAtTrapInstruction(ctxt *sigctxt) bool {
+	return *(*byte)(unsafe.Pointer(uintptr(ctxt.rip() - 1))) == 0xcc // INT 3
+}
+
+func sigctxtStatus(ctxt *sigctxt) uint64 {
+	return ctxt.r12()
+}
+
+func (h *debugCallHandler) saveSigContext(ctxt *sigctxt) {
+	// Push current PC on the stack.
+	rsp := ctxt.rsp() - goarch.PtrSize
+	*(*uint64)(unsafe.Pointer(uintptr(rsp))) = ctxt.rip()
+	ctxt.set_rsp(rsp)
+	// Write the argument frame size.
+	*(*uintptr)(unsafe.Pointer(uintptr(rsp - 16))) = h.argSize
+	// Save current registers.
+	h.sigCtxt.savedRegs = *ctxt.regs()
+	h.sigCtxt.savedFP = *h.sigCtxt.savedRegs.fpstate
+	h.sigCtxt.savedRegs.fpstate = nil
+}
+
+// case 0
+func (h *debugCallHandler) debugCallRun(ctxt *sigctxt) {
+	rsp := ctxt.rsp()
+	memmove(unsafe.Pointer(uintptr(rsp)), h.argp, h.argSize)
+	if h.regArgs != nil {
+		storeRegArgs(ctxt.regs(), h.regArgs)
+	}
+	// Push return PC.
+	rsp -= goarch.PtrSize
+	ctxt.set_rsp(rsp)
+	// The signal PC is the next PC of the trap instruction.
+	*(*uint64)(unsafe.Pointer(uintptr(rsp))) = ctxt.rip()
+	// Set PC to call and context register.
+	ctxt.set_rip(uint64(h.fv.fn))
+	sigctxtSetContextRegister(ctxt, uint64(uintptr(unsafe.Pointer(h.fv))))
+}
+
+// case 1
+func (h *debugCallHandler) debugCallReturn(ctxt *sigctxt) {
+	rsp := ctxt.rsp()
+	memmove(h.argp, unsafe.Pointer(uintptr(rsp)), h.argSize)
+	if h.regArgs != nil {
+		loadRegArgs(h.regArgs, ctxt.regs())
+	}
+}
+
+// case 2
+func (h *debugCallHandler) debugCallPanicOut(ctxt *sigctxt) {
+	rsp := ctxt.rsp()
+	memmove(unsafe.Pointer(&h.panic), unsafe.Pointer(uintptr(rsp)), 2*goarch.PtrSize)
+}
+
+// case 8
+func (h *debugCallHandler) debugCallUnsafe(ctxt *sigctxt) {
+	rsp := ctxt.rsp()
+	reason := *(*string)(unsafe.Pointer(uintptr(rsp)))
+	h.err = plainError(reason)
+}
+
+// case 16
+func (h *debugCallHandler) restoreSigContext(ctxt *sigctxt) {
+	// Restore all registers except RIP and RSP.
+	rip, rsp := ctxt.rip(), ctxt.rsp()
+	fp := ctxt.regs().fpstate
+	*ctxt.regs() = h.sigCtxt.savedRegs
+	ctxt.regs().fpstate = fp
+	*fp = h.sigCtxt.savedFP
+	ctxt.set_rip(rip)
+	ctxt.set_rsp(rsp)
+}
+
+// storeRegArgs sets up argument registers in the signal
+// context state from an abi.RegArgs.
+//
+// Both src and dst must be non-nil.
+func storeRegArgs(dst *sigcontext, src *abi.RegArgs) {
+	dst.rax = uint64(src.Ints[0])
+	dst.rbx = uint64(src.Ints[1])
+	dst.rcx = uint64(src.Ints[2])
+	dst.rdi = uint64(src.Ints[3])
+	dst.rsi = uint64(src.Ints[4])
+	dst.r8 = uint64(src.Ints[5])
+	dst.r9 = uint64(src.Ints[6])
+	dst.r10 = uint64(src.Ints[7])
+	dst.r11 = uint64(src.Ints[8])
+	for i := range src.Floats {
+		dst.fpstate._xmm[i].element[0] = uint32(src.Floats[i] >> 0)
+		dst.fpstate._xmm[i].element[1] = uint32(src.Floats[i] >> 32)
+	}
+}
+
+func loadRegArgs(dst *abi.RegArgs, src *sigcontext) {
+	dst.Ints[0] = uintptr(src.rax)
+	dst.Ints[1] = uintptr(src.rbx)
+	dst.Ints[2] = uintptr(src.rcx)
+	dst.Ints[3] = uintptr(src.rdi)
+	dst.Ints[4] = uintptr(src.rsi)
+	dst.Ints[5] = uintptr(src.r8)
+	dst.Ints[6] = uintptr(src.r9)
+	dst.Ints[7] = uintptr(src.r10)
+	dst.Ints[8] = uintptr(src.r11)
+	for i := range dst.Floats {
+		dst.Floats[i] = uint64(src.fpstate._xmm[i].element[0]) << 0
+		dst.Floats[i] |= uint64(src.fpstate._xmm[i].element[1]) << 32
+	}
+}
diff --git a/src/runtime/export_debug_arm64_test.go b/src/runtime/export_debug_arm64_test.go
new file mode 100644
index 0000000..ee90241
--- /dev/null
+++ b/src/runtime/export_debug_arm64_test.go
@@ -0,0 +1,135 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build arm64 && linux
+
+package runtime
+
+import (
+	"internal/abi"
+	"internal/goarch"
+	"unsafe"
+)
+
+type sigContext struct {
+	savedRegs sigcontext
+}
+
+func sigctxtSetContextRegister(ctxt *sigctxt, x uint64) {
+	ctxt.regs().regs[26] = x
+}
+
+func sigctxtAtTrapInstruction(ctxt *sigctxt) bool {
+	return *(*uint32)(unsafe.Pointer(ctxt.sigpc())) == 0xd4200000 // BRK 0
+}
+
+func sigctxtStatus(ctxt *sigctxt) uint64 {
+	return ctxt.r20()
+}
+
+func (h *debugCallHandler) saveSigContext(ctxt *sigctxt) {
+	sp := ctxt.sp()
+	sp -= 2 * goarch.PtrSize
+	ctxt.set_sp(sp)
+	*(*uint64)(unsafe.Pointer(uintptr(sp))) = ctxt.lr() // save the current lr
+	ctxt.set_lr(ctxt.pc())                              // set new lr to the current pc
+	// Write the argument frame size.
+	*(*uintptr)(unsafe.Pointer(uintptr(sp - 16))) = h.argSize
+	// Save current registers.
+	h.sigCtxt.savedRegs = *ctxt.regs()
+}
+
+// case 0
+func (h *debugCallHandler) debugCallRun(ctxt *sigctxt) {
+	sp := ctxt.sp()
+	memmove(unsafe.Pointer(uintptr(sp)+8), h.argp, h.argSize)
+	if h.regArgs != nil {
+		storeRegArgs(ctxt.regs(), h.regArgs)
+	}
+	// Set the return address to the signal PC+4: on arm64 the return
+	// address lives in LR rather than on the stack, and the signal PC is
+	// the PC of the trap instruction itself (BRK does not advance past it).
+	ctxt.set_lr(ctxt.pc() + 4)
+	// Set PC to call and context register.
+	ctxt.set_pc(uint64(h.fv.fn))
+	sigctxtSetContextRegister(ctxt, uint64(uintptr(unsafe.Pointer(h.fv))))
+}
+
+// case 1
+func (h *debugCallHandler) debugCallReturn(ctxt *sigctxt) {
+	sp := ctxt.sp()
+	memmove(h.argp, unsafe.Pointer(uintptr(sp)+8), h.argSize)
+	if h.regArgs != nil {
+		loadRegArgs(h.regArgs, ctxt.regs())
+	}
+	// Restore the old lr from *sp
+	olr := *(*uint64)(unsafe.Pointer(uintptr(sp)))
+	ctxt.set_lr(olr)
+	pc := ctxt.pc()
+	ctxt.set_pc(pc + 4) // step to next instruction
+}
+
+// case 2
+func (h *debugCallHandler) debugCallPanicOut(ctxt *sigctxt) {
+	sp := ctxt.sp()
+	memmove(unsafe.Pointer(&h.panic), unsafe.Pointer(uintptr(sp)+8), 2*goarch.PtrSize)
+	ctxt.set_pc(ctxt.pc() + 4)
+}
+
+// case 8
+func (h *debugCallHandler) debugCallUnsafe(ctxt *sigctxt) {
+	sp := ctxt.sp()
+	reason := *(*string)(unsafe.Pointer(uintptr(sp) + 8))
+	h.err = plainError(reason)
+	ctxt.set_pc(ctxt.pc() + 4)
+}
+
+// case 16
+func (h *debugCallHandler) restoreSigContext(ctxt *sigctxt) {
+	// Restore all registers except for pc and sp
+	pc, sp := ctxt.pc(), ctxt.sp()
+	*ctxt.regs() = h.sigCtxt.savedRegs
+	ctxt.set_pc(pc + 4)
+	ctxt.set_sp(sp)
+}
+
+// storeRegArgs sets up argument registers in the signal
+// context state from an abi.RegArgs.
+//
+// Both src and dst must be non-nil.
+func storeRegArgs(dst *sigcontext, src *abi.RegArgs) {
+	for i, r := range src.Ints {
+		dst.regs[i] = uint64(r)
+	}
+	for i, r := range src.Floats {
+		*(fpRegAddr(dst, i)) = r
+	}
+}
+
+func loadRegArgs(dst *abi.RegArgs, src *sigcontext) {
+	for i := range dst.Ints {
+		dst.Ints[i] = uintptr(src.regs[i])
+	}
+	for i := range dst.Floats {
+		dst.Floats[i] = *(fpRegAddr(src, i))
+	}
+}
+
+// fpRegAddr returns the address of the ith fp-simd register in sigcontext.
+func fpRegAddr(dst *sigcontext, i int) *uint64 {
+	/* FP-SIMD registers are saved in sigcontext.__reserved, which is organized in
+	the following C structs:
+	struct fpsimd_context {
+		struct _aarch64_ctx head;
+		__u32 fpsr;
+		__u32 fpcr;
+		__uint128_t vregs[32];
+	};
+	struct _aarch64_ctx {
+		__u32 magic;
+		__u32 size;
+	};
+	head, fpsr, and fpcr together occupy 16 bytes, and each 128-bit vreg
+	occupies 16 bytes, so the byte offset of the ith FP-SIMD register is 16+i*16.
+	*/
+	return (*uint64)(unsafe.Pointer(&dst.__reserved[16+i*16]))
+}
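A standalone sketch that mirrors the kernel layout in Go to check the arithmetic; the struct names are illustrative, not runtime identifiers, and byte arrays stand in for __uint128_t (which does not change the offsets here).

package main

import (
	"fmt"
	"unsafe"
)

// Go mirrors of the C structs quoted in the comment above.
type aarch64Ctx struct {
	magic uint32
	size  uint32
}

type fpsimdContext struct {
	head  aarch64Ctx
	fpsr  uint32
	fpcr  uint32
	vregs [32][16]byte
}

func main() {
	var c fpsimdContext
	fmt.Println(unsafe.Offsetof(c.vregs))  // 16: head + fpsr + fpcr
	fmt.Println(unsafe.Sizeof(c.vregs[0])) // 16: one 128-bit vreg
	// So vregs[i] begins at byte offset 16 + i*16 within the context.
}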
diff --git a/src/runtime/export_debug_regabiargs_off_test.go b/src/runtime/export_debug_regabiargs_off_test.go
deleted file mode 100644
index 81f7392..0000000
--- a/src/runtime/export_debug_regabiargs_off_test.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build amd64 && linux && !goexperiment.regabiargs
-
-package runtime
-
-import "internal/abi"
-
-func storeRegArgs(dst *sigcontext, src *abi.RegArgs) {
-}
-
-func loadRegArgs(dst *abi.RegArgs, src *sigcontext) {
-}
diff --git a/src/runtime/export_debug_regabiargs_on_test.go b/src/runtime/export_debug_regabiargs_on_test.go
deleted file mode 100644
index 7d1ab68..0000000
--- a/src/runtime/export_debug_regabiargs_on_test.go
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build amd64 && linux && goexperiment.regabiargs
-
-package runtime
-
-import "internal/abi"
-
-// storeRegArgs sets up argument registers in the signal
-// context state from an abi.RegArgs.
-//
-// Both src and dst must be non-nil.
-func storeRegArgs(dst *sigcontext, src *abi.RegArgs) {
-	dst.rax = uint64(src.Ints[0])
-	dst.rbx = uint64(src.Ints[1])
-	dst.rcx = uint64(src.Ints[2])
-	dst.rdi = uint64(src.Ints[3])
-	dst.rsi = uint64(src.Ints[4])
-	dst.r8 = uint64(src.Ints[5])
-	dst.r9 = uint64(src.Ints[6])
-	dst.r10 = uint64(src.Ints[7])
-	dst.r11 = uint64(src.Ints[8])
-	for i := range src.Floats {
-		dst.fpstate._xmm[i].element[0] = uint32(src.Floats[i] >> 0)
-		dst.fpstate._xmm[i].element[1] = uint32(src.Floats[i] >> 32)
-	}
-}
-
-func loadRegArgs(dst *abi.RegArgs, src *sigcontext) {
-	dst.Ints[0] = uintptr(src.rax)
-	dst.Ints[1] = uintptr(src.rbx)
-	dst.Ints[2] = uintptr(src.rcx)
-	dst.Ints[3] = uintptr(src.rdi)
-	dst.Ints[4] = uintptr(src.rsi)
-	dst.Ints[5] = uintptr(src.r8)
-	dst.Ints[6] = uintptr(src.r9)
-	dst.Ints[7] = uintptr(src.r10)
-	dst.Ints[8] = uintptr(src.r11)
-	for i := range dst.Floats {
-		dst.Floats[i] = uint64(src.fpstate._xmm[i].element[0]) << 0
-		dst.Floats[i] |= uint64(src.fpstate._xmm[i].element[1]) << 32
-	}
-}
diff --git a/src/runtime/export_debug_test.go b/src/runtime/export_debug_test.go
index 19a9ec1..09e9779 100644
--- a/src/runtime/export_debug_test.go
+++ b/src/runtime/export_debug_test.go
@@ -2,13 +2,12 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build amd64 && linux
+//go:build (amd64 || arm64) && linux
 
 package runtime
 
 import (
 	"internal/abi"
-	"internal/goarch"
 	"unsafe"
 )
 
@@ -100,10 +99,9 @@
 
 	handleF func(info *siginfo, ctxt *sigctxt, gp2 *g) bool
 
-	err       plainError
-	done      note
-	savedRegs sigcontext
-	savedFP   fpstate1
+	err     plainError
+	done    note
+	sigCtxt sigContext
 }
 
 func (h *debugCallHandler) inject(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
@@ -117,18 +115,10 @@
 			println("trap on wrong M", getg().m, h.mp)
 			return false
 		}
-		// Push current PC on the stack.
-		rsp := ctxt.rsp() - goarch.PtrSize
-		*(*uint64)(unsafe.Pointer(uintptr(rsp))) = ctxt.rip()
-		ctxt.set_rsp(rsp)
-		// Write the argument frame size.
-		*(*uintptr)(unsafe.Pointer(uintptr(rsp - 16))) = h.argSize
-		// Save current registers.
-		h.savedRegs = *ctxt.regs()
-		h.savedFP = *h.savedRegs.fpstate
-		h.savedRegs.fpstate = nil
+		// Save the signal context
+		h.saveSigContext(ctxt)
 		// Set PC to debugCallV2.
-		ctxt.set_rip(uint64(abi.FuncPCABIInternal(debugCallV2)))
+		ctxt.setsigpc(uint64(abi.FuncPCABIInternal(debugCallV2)))
 		// Call injected. Switch to the debugCall protocol.
 		testSigtrap = h.handleF
 	case _Grunnable:
@@ -154,57 +144,33 @@
 		println("trap on wrong M", getg().m, h.mp)
 		return false
 	}
-	f := findfunc(uintptr(ctxt.rip()))
+	f := findfunc(ctxt.sigpc())
 	if !(hasPrefix(funcname(f), "runtime.debugCall") || hasPrefix(funcname(f), "debugCall")) {
 		println("trap in unknown function", funcname(f))
 		return false
 	}
-	if *(*byte)(unsafe.Pointer(uintptr(ctxt.rip() - 1))) != 0xcc {
-		println("trap at non-INT3 instruction pc =", hex(ctxt.rip()))
+	if !sigctxtAtTrapInstruction(ctxt) {
+		println("trap at non-INT3 instruction pc =", hex(ctxt.sigpc()))
 		return false
 	}
 
-	switch status := ctxt.r12(); status {
+	switch status := sigctxtStatus(ctxt); status {
 	case 0:
 		// Frame is ready. Copy the arguments to the frame and to registers.
-		sp := ctxt.rsp()
-		memmove(unsafe.Pointer(uintptr(sp)), h.argp, h.argSize)
-		if h.regArgs != nil {
-			storeRegArgs(ctxt.regs(), h.regArgs)
-		}
-		// Push return PC.
-		sp -= goarch.PtrSize
-		ctxt.set_rsp(sp)
-		*(*uint64)(unsafe.Pointer(uintptr(sp))) = ctxt.rip()
-		// Set PC to call and context register.
-		ctxt.set_rip(uint64(h.fv.fn))
-		ctxt.regs().rdx = uint64(uintptr(unsafe.Pointer(h.fv)))
+		// Call the debug function.
+		h.debugCallRun(ctxt)
 	case 1:
 		// Function returned. Copy frame and result registers back out.
-		sp := ctxt.rsp()
-		memmove(h.argp, unsafe.Pointer(uintptr(sp)), h.argSize)
-		if h.regArgs != nil {
-			loadRegArgs(h.regArgs, ctxt.regs())
-		}
+		h.debugCallReturn(ctxt)
 	case 2:
 		// Function panicked. Copy panic out.
-		sp := ctxt.rsp()
-		memmove(unsafe.Pointer(&h.panic), unsafe.Pointer(uintptr(sp)), 2*goarch.PtrSize)
+		h.debugCallPanicOut(ctxt)
 	case 8:
 		// Call isn't safe. Get the reason.
-		sp := ctxt.rsp()
-		reason := *(*string)(unsafe.Pointer(uintptr(sp)))
-		h.err = plainError(reason)
+		h.debugCallUnsafe(ctxt)
 		// Don't wake h.done. We need to transition to status 16 first.
 	case 16:
-		// Restore all registers except RIP and RSP.
-		rip, rsp := ctxt.rip(), ctxt.rsp()
-		fp := ctxt.regs().fpstate
-		*ctxt.regs() = h.savedRegs
-		ctxt.regs().fpstate = fp
-		*fp = h.savedFP
-		ctxt.set_rip(rip)
-		ctxt.set_rsp(rsp)
+		h.restoreSigContext(ctxt)
 		// Done
 		notewakeup(&h.done)
 	default:
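For orientation, a sketch of the status protocol this switch dispatches on; the constant names are invented for readability (the runtime switches on bare literals), and the status is read from a fixed register, R12 on amd64 and R20 on arm64, via sigctxtStatus.

const (
	statusRun     = 0  // frame ready: copy args in and call the function
	statusReturn  = 1  // function returned: copy frame and result registers out
	statusPanic   = 2  // function panicked: copy the panic value out
	statusUnsafe  = 8  // call is not safe: read the reason string
	statusRestore = 16 // restore the saved signal context and finish
)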
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index af27050..3916eaf 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -85,6 +85,7 @@
 func RunSchedLocalQueueTest() {
 	_p_ := new(p)
 	gs := make([]g, len(_p_.runq))
+	escape(gs) // Ensure gs doesn't move, since we use guintptrs
 	for i := 0; i < len(_p_.runq); i++ {
 		if g, _ := runqget(_p_); g != nil {
 			throw("runq is not empty initially")
@@ -108,6 +109,7 @@
 	p1 := new(p)
 	p2 := new(p)
 	gs := make([]g, len(p1.runq))
+	escape(gs) // Ensure gs doesn't move, since we use guintptrs
 	for i := 0; i < len(p1.runq); i++ {
 		for j := 0; j < i; j++ {
 			gs[j].sig = 0
@@ -155,6 +157,7 @@
 	done := make(chan bool, 1)
 	p := new(p)
 	gs := make([]g, 2)
+	escape(gs) // Ensure gs doesn't move, since we use guintptrs
 	ready := new(uint32)
 	for i := 0; i < iters; i++ {
 		*ready = 0
@@ -277,6 +280,7 @@
 }
 
 func Fastrand() uint32          { return fastrand() }
+func Fastrand64() uint64        { return fastrand64() }
 func Fastrandn(n uint32) uint32 { return fastrandn(n) }
 
 type ProfBuf profBuf
@@ -1256,7 +1260,7 @@
 	// do 64-bit atomics on it, and if it gets stack-allocated
 	// on a 32-bit architecture, it may get allocated unaligned
 	// space.
-	g := escape(new(GCController)).(*GCController)
+	g := escape(new(GCController))
 	g.gcControllerState.test = true // Mark it as a test copy.
 	g.init(int32(gcPercent))
 	return g
@@ -1267,7 +1271,7 @@
 	c.globalsScan = globalsSize
 	c.heapLive = c.trigger
 	c.heapScan += uint64(float64(c.trigger-c.heapMarked) * scannableFrac)
-	c.startCycle(0, gomaxprocs)
+	c.startCycle(0, gomaxprocs, gcTrigger{kind: gcTriggerHeap})
 }
 
 func (c *GCController) AssistWorkPerByte() float64 {
@@ -1314,10 +1318,27 @@
 	c.commit()
 }
 
+func (c *GCController) AddIdleMarkWorker() bool {
+	return c.addIdleMarkWorker()
+}
+
+func (c *GCController) NeedIdleMarkWorker() bool {
+	return c.needIdleMarkWorker()
+}
+
+func (c *GCController) RemoveIdleMarkWorker() {
+	c.removeIdleMarkWorker()
+}
+
+func (c *GCController) SetMaxIdleMarkWorkers(max int32) {
+	c.setMaxIdleMarkWorkers(max)
+}
+
 var escapeSink any
 
 //go:noinline
-func escape(x any) any {
+//go:norace
+func escape[T any](x T) T {
 	escapeSink = x
 	escapeSink = nil
 	return x
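The now-generic escape helper works by leaking its argument to a package-level sink, which forces escape analysis to heap-allocate it; the type parameter lets callers drop the type assertion the old any-based version required. A standalone sketch of the idiom:

package main

import "fmt"

var sink any

// escape leaks x to a package-level sink, so escape analysis must
// heap-allocate it; the generic signature returns x with its static type.
//
//go:noinline
func escape[T any](x T) T {
	sink = x
	sink = nil
	return x
}

func main() {
	g := escape(new(int)) // g is *int, and *g is guaranteed heap-allocated
	*g = 42
	fmt.Println(*g)
}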
@@ -1351,3 +1372,107 @@
 func (c *PIController) Next(input, setpoint, period float64) (float64, bool) {
 	return c.piController.next(input, setpoint, period)
 }
+
+const ScavengePercent = scavengePercent
+
+type Scavenger struct {
+	Sleep      func(int64) int64
+	Scavenge   func(uintptr) (uintptr, int64)
+	ShouldStop func() bool
+	GoMaxProcs func() int32
+
+	released  atomic.Uintptr
+	scavenger scavengerState
+	stop      chan<- struct{}
+	done      <-chan struct{}
+}
+
+func (s *Scavenger) Start() {
+	if s.Sleep == nil || s.Scavenge == nil || s.ShouldStop == nil || s.GoMaxProcs == nil {
+		panic("must populate all stubs")
+	}
+
+	// Install hooks.
+	s.scavenger.sleepStub = s.Sleep
+	s.scavenger.scavenge = s.Scavenge
+	s.scavenger.shouldStop = s.ShouldStop
+	s.scavenger.gomaxprocs = s.GoMaxProcs
+
+	// Start up scavenger goroutine, and wait for it to be ready.
+	stop := make(chan struct{})
+	s.stop = stop
+	done := make(chan struct{})
+	s.done = done
+	go func() {
+		// This should match bgscavenge, loosely.
+		s.scavenger.init()
+		s.scavenger.park()
+		for {
+			select {
+			case <-stop:
+				close(done)
+				return
+			default:
+			}
+			released, workTime := s.scavenger.run()
+			if released == 0 {
+				s.scavenger.park()
+				continue
+			}
+			s.released.Add(released)
+			s.scavenger.sleep(workTime)
+		}
+	}()
+	if !s.BlockUntilParked(1e9 /* 1 second */) {
+		panic("timed out waiting for scavenger to get ready")
+	}
+}
+
+// BlockUntilParked blocks until the scavenger parks, or until
+// timeout is exceeded. Returns true if the scavenger parked.
+//
+// Note that in testing, parked means something slightly different.
+// In normal operation, the scavenger parks to sleep, too, but in
+// testing, it only parks when it actually has no work to do.
+func (s *Scavenger) BlockUntilParked(timeout int64) bool {
+	// Just spin, waiting for it to park.
+	//
+	// The actual parking process is racy with respect to
+	// wakeups, which is fine, but for testing we need something
+	// a bit more robust.
+	start := nanotime()
+	for nanotime()-start < timeout {
+		lock(&s.scavenger.lock)
+		parked := s.scavenger.parked
+		unlock(&s.scavenger.lock)
+		if parked {
+			return true
+		}
+		Gosched()
+	}
+	return false
+}
+
+// Released returns how many bytes the scavenger released.
+func (s *Scavenger) Released() uintptr {
+	return s.released.Load()
+}
+
+// Wake wakes up a parked scavenger to keep running.
+func (s *Scavenger) Wake() {
+	s.scavenger.wake()
+}
+
+// Stop cleans up the scavenger's resources. The scavenger
+// must be parked for this to work.
+func (s *Scavenger) Stop() {
+	lock(&s.scavenger.lock)
+	parked := s.scavenger.parked
+	unlock(&s.scavenger.lock)
+	if !parked {
+		panic("tried to clean up scavenger that is not parked")
+	}
+	close(s.stop)
+	s.Wake()
+	<-s.done
+}
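A sketch of how this harness might be driven; it compiles only inside the runtime's own test package, since Scavenger is exported via export_test.go. The stub behavior here (a draining pool of fake memory) is invented for illustration; TestScavenger later in this change is the authoritative use.

package runtime_test

import (
	. "runtime"
	"testing"
	"time"
)

func TestScavengerHarnessSketch(t *testing.T) {
	remaining := uintptr(512 << 10) // touched only by the scavenger goroutine

	s := &Scavenger{
		Sleep:      func(ns int64) int64 { return ns }, // pretend the full sleep elapsed
		ShouldStop: func() bool { return remaining == 0 },
		GoMaxProcs: func() int32 { return 1 },
		Scavenge: func(n uintptr) (uintptr, int64) {
			if n > remaining {
				n = remaining
			}
			remaining -= n
			return n, int64(10 * time.Microsecond) // pretend each batch took 10µs
		},
	}
	s.Start()
	s.Wake() // kick off one round of work
	if !s.BlockUntilParked(int64(time.Second)) {
		t.Fatal("scavenger never parked")
	}
	if got := s.Released(); got != 512<<10 {
		t.Errorf("released %d bytes, want %d", got, 512<<10)
	}
	s.Stop()
}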
diff --git a/src/runtime/hash_test.go b/src/runtime/hash_test.go
index cf56c57..e726006 100644
--- a/src/runtime/hash_test.go
+++ b/src/runtime/hash_test.go
@@ -525,6 +525,13 @@
 	if GOARCH == "wasm" {
 		t.Skip("Too slow on wasm")
 	}
+	if PtrSize == 4 {
+		// This test tends to be flaky on 32-bit systems.
+		// There aren't enough bits in the hash output, so we
+		// expect a nontrivial number of collisions, and it is
+		// often quite a bit higher than expected. See issue 43130.
+		t.Skip("Flaky on 32-bit systems")
+	}
 	if testing.Short() {
 		t.Skip("Skipping in short mode")
 	}
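A standalone sketch of the birthday-bound arithmetic behind the skip: n items hashed into m = 2^32 buckets collide about n(n-1)/(2m) times in expectation, so collisions are routine at these sample sizes. The value of n is illustrative.

package main

import "fmt"

func main() {
	n := float64(1 << 20)         // about a million hashed items (illustrative)
	m := float64(uint64(1) << 32) // size of a 32-bit hash output space
	// Birthday bound: expected pairwise collisions among n uniform hashes.
	fmt.Printf("expected collisions: %.0f\n", n*(n-1)/(2*m)) // ~128
}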
diff --git a/src/runtime/internal/syscall/asm_linux_amd64.s b/src/runtime/internal/syscall/asm_linux_amd64.s
index 961d9bd..3740ef1 100644
--- a/src/runtime/internal/syscall/asm_linux_amd64.s
+++ b/src/runtime/internal/syscall/asm_linux_amd64.s
@@ -6,28 +6,42 @@
 
 // func Syscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr)
 //
-// Syscall # in AX, args in DI SI DX R10 R8 R9, return in AX DX.
+// We need to convert to the syscall ABI.
+//
+// arg | ABIInternal | Syscall
+// ---------------------------
+// num | AX          | AX
+// a1  | BX          | DI
+// a2  | CX          | SI
+// a3  | DI          | DX
+// a4  | SI          | R10
+// a5  | R8          | R8
+// a6  | R9          | R9
+//
+// r1  | AX          | AX
+// r2  | BX          | DX
+// err | CX          | part of AX
 //
 // Note that this differs from "standard" ABI convention, which would pass 4th
 // arg in CX, not R10.
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	MOVQ	num+0(FP), AX	// syscall entry
-	MOVQ	a1+8(FP), DI
-	MOVQ	a2+16(FP), SI
-	MOVQ	a3+24(FP), DX
-	MOVQ	a4+32(FP), R10
-	MOVQ	a5+40(FP), R8
-	MOVQ	a6+48(FP), R9
+TEXT ·Syscall6<ABIInternal>(SB),NOSPLIT,$0
+	// a6 already in R9.
+	// a5 already in R8.
+	MOVQ	SI, R10 // a4
+	MOVQ	DI, DX  // a3
+	MOVQ	CX, SI  // a2
+	MOVQ	BX, DI  // a1
+	// num already in AX.
 	SYSCALL
 	CMPQ	AX, $0xfffffffffffff001
 	JLS	ok
-	MOVQ	$-1, r1+56(FP)
-	MOVQ	$0, r2+64(FP)
 	NEGQ	AX
-	MOVQ	AX, errno+72(FP)
+	MOVQ	AX, CX  // errno
+	MOVQ	$-1, AX // r1
+	MOVQ	$0, BX  // r2
 	RET
 ok:
-	MOVQ	AX, r1+56(FP)
-	MOVQ	DX, r2+64(FP)
-	MOVQ	$0, errno+72(FP)
+	// r1 already in AX.
+	MOVQ	DX, BX // r2
+	MOVQ	$0, CX // errno
 	RET
diff --git a/src/runtime/internal/syscall/syscall_linux.go b/src/runtime/internal/syscall/syscall_linux.go
index 06d5f21..7f268e8 100644
--- a/src/runtime/internal/syscall/syscall_linux.go
+++ b/src/runtime/internal/syscall/syscall_linux.go
@@ -5,8 +5,35 @@
 // Package syscall provides the syscall primitives required for the runtime.
 package syscall
 
+import (
+	_ "unsafe" // for go:linkname
+)
+
 // TODO(https://go.dev/issue/51087): This package is incomplete and currently
 // only contains very minimal support for Linux.
 
 // Syscall6 calls system call number 'num' with arguments a1-6.
 func Syscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr)
+
+// syscall_RawSyscall6 is a push linkname to export Syscall6 as
+// syscall.RawSyscall6.
+//
+// //go:uintptrkeepalive because the uintptr arguments may be pointers
+// converted to uintptr that need to be kept alive in the caller (this is
+// implied for Syscall6 since it has no body).
+//
+// //go:nosplit because stack copying does not account for uintptrkeepalive, so
+// the stack must not grow. Stack copying cannot blindly assume that all
+// uintptr arguments are pointers, because some values may look like pointers,
+// but not really be pointers, and adjusting their value would break the call.
+//
+// This is a separate wrapper because we can't export one function as two
+// names. The assembly implementations name themselves Syscall6, so they
+// would not be affected by a linkname.
+//
+//go:uintptrkeepalive
+//go:nosplit
+//go:linkname syscall_RawSyscall6 syscall.RawSyscall6
+func syscall_RawSyscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr) {
+	return Syscall6(num, a1, a2, a3, a4, a5, a6)
+}
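The linkname above makes runtime/internal/syscall.Syscall6 back the public syscall.RawSyscall6. A minimal usage sketch, assuming linux (SYS_GETPID takes no arguments, so the unused slots are zero):

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// RawSyscall6 forwards to runtime/internal/syscall.Syscall6 via the
	// linkname above.
	pid, _, errno := syscall.RawSyscall6(syscall.SYS_GETPID, 0, 0, 0, 0, 0, 0)
	if errno != 0 {
		fmt.Println("getpid failed:", errno)
		return
	}
	fmt.Println("pid:", pid)
}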
diff --git a/src/runtime/map.go b/src/runtime/map.go
index e91b25e..2e513e2 100644
--- a/src/runtime/map.go
+++ b/src/runtime/map.go
@@ -842,9 +842,11 @@
 	}
 
 	// decide where to start
-	r := uintptr(fastrand())
+	var r uintptr
 	if h.B > 31-bucketCntBits {
-		r += uintptr(fastrand()) << 31
+		r = uintptr(fastrand64())
+	} else {
+		r = uintptr(fastrand())
 	}
 	it.startBucket = r & bucketMask(h.B)
 	it.offset = uint8(r >> h.B & (bucketCnt - 1))
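A standalone sketch of how these random bits are consumed: the low h.B bits select the start bucket and the next bucketCntBits bits select the intra-bucket slot offset, so once B+bucketCntBits exceeds the 32 bits of fastrand(), a 64-bit value is needed, which is the h.B > 31-bucketCntBits case above.

package main

import "fmt"

const (
	bucketCntBits = 3
	bucketCnt     = 1 << bucketCntBits
)

// seed mimics how the iterator splits its random value: B bits for the
// starting bucket, then bucketCntBits bits for the slot offset.
func seed(r uint64, B uint8) (startBucket uint64, offset uint8) {
	mask := uint64(1)<<B - 1
	return r & mask, uint8(r >> B & (bucketCnt - 1))
}

func main() {
	bucket, off := seed(0xdeadbeefcafe, 5)
	fmt.Println(bucket, off) // start bucket in [0,32), offset in [0,8)
}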
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 86a8958..afd5afb 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -156,6 +156,25 @@
 			throw("bad sweepgen in refill")
 		}
 		mheap_.central[spc].mcentral.uncacheSpan(s)
+
+		// Count up how many slots were used and record it.
+		stats := memstats.heapStats.acquire()
+		slotsUsed := uintptr(s.allocCount) - uintptr(s.allocCountBeforeCache)
+		atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], slotsUsed)
+
+		// Flush tinyAllocs.
+		if spc == tinySpanClass {
+			atomic.Xadduintptr(&stats.tinyAllocCount, c.tinyAllocs)
+			c.tinyAllocs = 0
+		}
+		memstats.heapStats.release()
+
+		// Update heapLive and flush scanAlloc.
+		gcController.update(int64(slotsUsed*s.elemsize), int64(c.scanAlloc))
+		c.scanAlloc = 0
+
+		// Clear the second allocCount just to be safe.
+		s.allocCountBeforeCache = 0
 	}
 
 	// Get a new cached span from the central lists.
@@ -172,24 +191,8 @@
 	// sweeping in the next sweep phase.
 	s.sweepgen = mheap_.sweepgen + 3
 
-	// Assume all objects from this span will be allocated in the
-	// mcache. If it gets uncached, we'll adjust this.
-	stats := memstats.heapStats.acquire()
-	atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
-
-	// Flush tinyAllocs.
-	if spc == tinySpanClass {
-		atomic.Xadduintptr(&stats.tinyAllocCount, c.tinyAllocs)
-		c.tinyAllocs = 0
-	}
-	memstats.heapStats.release()
-
-	// Update heapLive with the same assumption.
-	// While we're here, flush scanAlloc, since we have to call
-	// revise anyway.
-	usedBytes := uintptr(s.allocCount) * s.elemsize
-	gcController.update(int64(s.npages*pageSize)-int64(usedBytes), int64(c.scanAlloc))
-	c.scanAlloc = 0
+	// Store the current alloc count for accounting later.
+	s.allocCountBeforeCache = s.allocCount
 
 	c.alloc[spc] = s
 }
@@ -235,26 +238,16 @@
 	scanAlloc := int64(c.scanAlloc)
 	c.scanAlloc = 0
 
-	sg := mheap_.sweepgen
-	dHeapLive := int64(0)
 	for i := range c.alloc {
 		s := c.alloc[i]
 		if s != &emptymspan {
-			// Adjust nsmallalloc in case the span wasn't fully allocated.
-			n := uintptr(s.nelems) - uintptr(s.allocCount)
+			// Adjust smallAllocCount for whatever was allocated.
 			stats := memstats.heapStats.acquire()
-			atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
+			slotsUsed := uintptr(s.allocCount) - uintptr(s.allocCountBeforeCache)
+			atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], slotsUsed)
 			memstats.heapStats.release()
-			if s.sweepgen != sg+1 {
-				// refill conservatively counted unallocated slots in gcController.heapLive.
-				// Undo this.
-				//
-				// If this span was cached before sweep, then
-				// gcController.heapLive was totally recomputed since
-				// caching this span, so we don't do this for
-				// stale spans.
-				dHeapLive -= int64(n) * int64(s.elemsize)
-			}
+			s.allocCountBeforeCache = 0
+
 			// Release the span to the mcentral.
 			mheap_.central[i].mcentral.uncacheSpan(s)
 			c.alloc[i] = &emptymspan
@@ -270,8 +263,8 @@
 	c.tinyAllocs = 0
 	memstats.heapStats.release()
 
-	// Updated heapScan and heapLive.
-	gcController.update(dHeapLive, scanAlloc)
+	// Updated heapScan.
+	gcController.update(0, scanAlloc)
 }
 
 // prepareForSweep flushes c if the system has entered a new sweep phase
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go
index 5d32ef4..4bd1408 100644
--- a/src/runtime/metrics_test.go
+++ b/src/runtime/metrics_test.go
@@ -9,6 +9,7 @@
 	"runtime/metrics"
 	"sort"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 	"unsafe"
@@ -319,3 +320,88 @@
 	b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns")
 	b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns")
 }
+
+var readMetricsSink [1024]interface{}
+
+func TestReadMetricsCumulative(t *testing.T) {
+	// Set up the set of metrics marked cumulative.
+	descs := metrics.All()
+	var samples [2][]metrics.Sample
+	samples[0] = make([]metrics.Sample, len(descs))
+	samples[1] = make([]metrics.Sample, len(descs))
+	total := 0
+	for i := range samples[0] {
+		if !descs[i].Cumulative {
+			continue
+		}
+		samples[0][total].Name = descs[i].Name
+		total++
+	}
+	samples[0] = samples[0][:total]
+	samples[1] = samples[1][:total]
+	copy(samples[1], samples[0])
+
+	// Start some noise in the background.
+	var wg sync.WaitGroup
+	wg.Add(1)
+	done := make(chan struct{})
+	go func() {
+		defer wg.Done()
+		for {
+			// Add more things here that could influence metrics.
+			for i := 0; i < len(readMetricsSink); i++ {
+				readMetricsSink[i] = make([]byte, 1024)
+				select {
+				case <-done:
+					return
+				default:
+				}
+			}
+			runtime.GC()
+		}
+	}()
+
+	sum := func(us []uint64) uint64 {
+		total := uint64(0)
+		for _, u := range us {
+			total += u
+		}
+		return total
+	}
+
+	// Populate the first generation.
+	metrics.Read(samples[0])
+
+	// Check to make sure that these metrics only grow monotonically.
+	for gen := 1; gen < 10; gen++ {
+		metrics.Read(samples[gen%2])
+		for i := range samples[gen%2] {
+			name := samples[gen%2][i].Name
+			vNew, vOld := samples[gen%2][i].Value, samples[1-(gen%2)][i].Value
+
+			switch vNew.Kind() {
+			case metrics.KindUint64:
+				new := vNew.Uint64()
+				old := vOld.Uint64()
+				if new < old {
+					t.Errorf("%s decreased: %d < %d", name, new, old)
+				}
+			case metrics.KindFloat64:
+				new := vNew.Float64()
+				old := vOld.Float64()
+				if new < old {
+					t.Errorf("%s decreased: %f < %f", name, new, old)
+				}
+			case metrics.KindFloat64Histogram:
+				new := sum(vNew.Float64Histogram().Counts)
+				old := sum(vOld.Float64Histogram().Counts)
+				if new < old {
+					t.Errorf("%s counts decreased: %d < %d", name, new, old)
+				}
+			}
+		}
+	}
+	close(done)
+
+	wg.Wait()
+}
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 9f17e47..604d0db 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -672,7 +672,7 @@
 
 	// Assists and workers can start the moment we start
 	// the world.
-	gcController.startCycle(now, int(gomaxprocs))
+	gcController.startCycle(now, int(gomaxprocs), trigger)
 	work.heapGoal = gcController.heapGoal
 
 	// In STW mode, disable scheduling of user Gs. This may also
@@ -1297,9 +1297,9 @@
 			casgstatus(gp, _Gwaiting, _Grunning)
 		})
 
-		// Account for time.
+		// Account for time and mark us as stopped.
 		duration := nanotime() - startTime
-		gcController.logWorkTime(pp.gcMarkWorkerMode, duration)
+		gcController.markWorkerStop(pp.gcMarkWorkerMode, duration)
 		if pp.gcMarkWorkerMode == gcMarkWorkerFractionalMode {
 			atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
 		}
diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go
index 940bc52..e331386 100644
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -280,6 +280,40 @@
 	// dedicated mark workers get started.
 	dedicatedMarkWorkersNeeded int64
 
+	// idleMarkWorkers is two packed int32 values in a single uint64.
+	// These two values are always updated simultaneously.
+	//
+	// The bottom int32 is the current number of idle mark workers executing.
+	//
+	// The top int32 is the maximum number of idle mark workers allowed to
+	// execute concurrently. Normally, this number is just gomaxprocs. However,
+	// during periodic GC cycles it is set to 0 because the system is idle
+	// anyway; there's no need to go full blast on all of GOMAXPROCS.
+	//
+	// The maximum number of idle mark workers is used to prevent new workers
+	// from starting, but it is not a hard maximum. It is possible (but
+	// exceedingly rare) for the current number of idle mark workers to
+	// transiently exceed the maximum. This could happen if the maximum changes
+	// just after a GC ends, racing with an M (running without a P) that has
+	// not yet removed itself from the count.
+	//
+	// Note that if we have no dedicated mark workers, we set this value to
+	// 1, because in that case we only have fractional GC workers, which
+	// aren't scheduled strictly enough to ensure GC progress. As a result,
+	// idle-priority mark workers are vital to GC progress in these situations.
+	//
+	// For example, consider a situation in which goroutines block on the GC
+	// (such as via runtime.GOMAXPROCS) and only fractional mark workers are
+	// scheduled (e.g. GOMAXPROCS=1). Without idle-priority mark workers, the
+	// last running M might skip scheduling a fractional mark worker if its
+	// utilization goal is met, such that once it goes to sleep (because there's
+	// nothing to do), there will be nothing else to spin up a new M for the
+	// fractional worker in the future, stalling GC progress and causing a
+	// deadlock. However, idle-priority workers will *always* run when there is
+	// nothing left to do, ensuring the GC makes progress.
+	//
+	// See github.com/golang/go/issues/44163 for more details.
+	idleMarkWorkers atomic.Uint64
+
 	// assistWorkPerByte is the ratio of scan work to allocated
 	// bytes that should be performed by mutator assists. This is
 	// computed at the beginning of each cycle and updated every
@@ -342,7 +376,7 @@
 // startCycle resets the GC controller's state and computes estimates
 // for a new GC cycle. The caller must hold worldsema and the world
 // must be stopped.
-func (c *gcControllerState) startCycle(markStartTime int64, procs int) {
+func (c *gcControllerState) startCycle(markStartTime int64, procs int, trigger gcTrigger) {
 	c.heapScanWork.Store(0)
 	c.stackScanWork.Store(0)
 	c.globalsScanWork.Store(0)
@@ -400,6 +434,26 @@
 		p.gcFractionalMarkTime = 0
 	}
 
+	if trigger.kind == gcTriggerTime {
+		// During a periodic GC cycle, reduce the number of idle mark workers
+		// required. However, we need at least one dedicated mark worker or
+		// idle GC worker to ensure GC progress in some scenarios (see comment
+		// on maxIdleMarkWorkers).
+		if c.dedicatedMarkWorkersNeeded > 0 {
+			c.setMaxIdleMarkWorkers(0)
+		} else {
+			// TODO(mknyszek): The fundamental reason why we need this is because
+			// we can't count on the fractional mark worker to get scheduled.
+			// Fix that by ensuring it gets scheduled according to its quota even
+			// if the rest of the application is idle.
+			c.setMaxIdleMarkWorkers(1)
+		}
+	} else {
+		// N.B. gomaxprocs and dedicatedMarkWorkersNeeded are guaranteed not to
+		// change during a GC cycle.
+		c.setMaxIdleMarkWorkers(int32(procs) - int32(c.dedicatedMarkWorkersNeeded))
+	}
+
 	// Compute initial values for controls that are updated
 	// throughout the cycle.
 	c.revise()
@@ -781,11 +835,13 @@
 	}
 }
 
-// logWorkTime updates mark work accounting in the controller by a duration of
-// work in nanoseconds.
+// markWorkerStop must be called whenever a mark worker stops executing.
+//
+// It updates mark work accounting in the controller by a duration of
+// work in nanoseconds and performs other bookkeeping.
 //
 // Safe to execute at any time.
-func (c *gcControllerState) logWorkTime(mode gcMarkWorkerMode, duration int64) {
+func (c *gcControllerState) markWorkerStop(mode gcMarkWorkerMode, duration int64) {
 	switch mode {
 	case gcMarkWorkerDedicatedMode:
 		atomic.Xaddint64(&c.dedicatedMarkTime, duration)
@@ -794,8 +850,9 @@
 		atomic.Xaddint64(&c.fractionalMarkTime, duration)
 	case gcMarkWorkerIdleMode:
 		atomic.Xaddint64(&c.idleMarkTime, duration)
+		c.removeIdleMarkWorker()
 	default:
-		throw("logWorkTime: unknown mark worker mode")
+		throw("markWorkerStop: unknown mark worker mode")
 	}
 }
 
@@ -1100,3 +1157,81 @@
 func (c *piController) reset() {
 	c.errIntegral = 0
 }
+
+// addIdleMarkWorker attempts to add a new idle mark worker.
+//
+// If this returns true, the caller must become an idle mark worker unless
+// there are no background mark worker goroutines in the pool. This case is
+// harmless because there are already background mark workers running.
+// If this returns false, the caller must NOT become an idle mark worker.
+//
+// nosplit because it may be called without a P.
+//go:nosplit
+func (c *gcControllerState) addIdleMarkWorker() bool {
+	for {
+		old := c.idleMarkWorkers.Load()
+		n, max := int32(old&uint64(^uint32(0))), int32(old>>32)
+		if n >= max {
+			// See the comment on idleMarkWorkers for why
+			// n > max is tolerated.
+			return false
+		}
+		if n < 0 {
+			print("n=", n, " max=", max, "\n")
+			throw("negative idle mark workers")
+		}
+		new := uint64(uint32(n+1)) | (uint64(max) << 32)
+		if c.idleMarkWorkers.CompareAndSwap(old, new) {
+			return true
+		}
+	}
+}
+
+// needIdleMarkWorker is a hint as to whether another idle mark worker is needed.
+//
+// The caller must still call addIdleMarkWorker to become one. This is mainly
+// useful for a quick check before an expensive operation.
+//
+// nosplit because it may be called without a P.
+//go:nosplit
+func (c *gcControllerState) needIdleMarkWorker() bool {
+	p := c.idleMarkWorkers.Load()
+	n, max := int32(p&uint64(^uint32(0))), int32(p>>32)
+	return n < max
+}
+
+// removeIdleMarkWorker must be called when an idle mark worker stops executing.
+func (c *gcControllerState) removeIdleMarkWorker() {
+	for {
+		old := c.idleMarkWorkers.Load()
+		n, max := int32(old&uint64(^uint32(0))), int32(old>>32)
+		if n-1 < 0 {
+			print("n=", n, " max=", max, "\n")
+			throw("negative idle mark workers")
+		}
+		new := uint64(uint32(n-1)) | (uint64(max) << 32)
+		if c.idleMarkWorkers.CompareAndSwap(old, new) {
+			return
+		}
+	}
+}
+
+// setMaxIdleMarkWorkers sets the maximum number of idle mark workers allowed.
+//
+// This method is optimistic in that it does not wait for the number of
+// idle mark workers to reduce to max before returning; it assumes the workers
+// will deschedule themselves.
+func (c *gcControllerState) setMaxIdleMarkWorkers(max int32) {
+	for {
+		old := c.idleMarkWorkers.Load()
+		n := int32(old & uint64(^uint32(0)))
+		if n < 0 {
+			print("n=", n, " max=", max, "\n")
+			throw("negative idle mark workers")
+		}
+		new := uint64(uint32(n)) | (uint64(max) << 32)
+		if c.idleMarkWorkers.CompareAndSwap(old, new) {
+			return
+		}
+	}
+}
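A standalone sketch of the packing scheme used by idleMarkWorkers above: two int32 values (the count in the low half, the maximum in the high half) updated together with a single compare-and-swap. It uses Go 1.19's sync/atomic.Uint64 in place of the runtime-internal atomic package.

package main

import (
	"fmt"
	"sync/atomic"
)

// pack/unpack mirror the encoding: worker count in the low 32 bits,
// maximum in the high 32 bits, so both update in one CAS.
func pack(n, max int32) uint64 { return uint64(uint32(n)) | uint64(max)<<32 }

func unpack(p uint64) (n, max int32) {
	return int32(p & uint64(^uint32(0))), int32(p >> 32)
}

// tryAdd increments the count only while it is below max, like
// addIdleMarkWorker above.
func tryAdd(v *atomic.Uint64) bool {
	for {
		old := v.Load()
		n, max := unpack(old)
		if n >= max {
			return false
		}
		if v.CompareAndSwap(old, pack(n+1, max)) {
			return true
		}
	}
}

func main() {
	var v atomic.Uint64
	v.Store(pack(0, 2))
	fmt.Println(tryAdd(&v), tryAdd(&v), tryAdd(&v)) // true true false
}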
diff --git a/src/runtime/mgcpacer_test.go b/src/runtime/mgcpacer_test.go
index b49e3a8..2362889 100644
--- a/src/runtime/mgcpacer_test.go
+++ b/src/runtime/mgcpacer_test.go
@@ -738,3 +738,67 @@
 		}
 	})
 }
+
+func TestIdleMarkWorkerCount(t *testing.T) {
+	const workers = 10
+	c := NewGCController(100)
+	c.SetMaxIdleMarkWorkers(workers)
+	for i := 0; i < workers; i++ {
+		if !c.NeedIdleMarkWorker() {
+			t.Fatalf("expected to need idle mark workers: i=%d", i)
+		}
+		if !c.AddIdleMarkWorker() {
+			t.Fatalf("expected to be able to add an idle mark worker: i=%d", i)
+		}
+	}
+	if c.NeedIdleMarkWorker() {
+		t.Fatalf("expected to not need idle mark workers")
+	}
+	if c.AddIdleMarkWorker() {
+		t.Fatalf("expected to not be able to add an idle mark worker")
+	}
+	for i := 0; i < workers; i++ {
+		c.RemoveIdleMarkWorker()
+		if !c.NeedIdleMarkWorker() {
+			t.Fatalf("expected to need idle mark workers after removal: i=%d", i)
+		}
+	}
+	for i := 0; i < workers-1; i++ {
+		if !c.AddIdleMarkWorker() {
+			t.Fatalf("expected to be able to add idle mark workers after adding again: i=%d", i)
+		}
+	}
+	for i := 0; i < 10; i++ {
+		if !c.AddIdleMarkWorker() {
+			t.Fatalf("expected to be able to add idle mark workers interleaved: i=%d", i)
+		}
+		if c.AddIdleMarkWorker() {
+			t.Fatalf("expected to not be able to add idle mark workers interleaved: i=%d", i)
+		}
+		c.RemoveIdleMarkWorker()
+	}
+	// Support the max being below the count.
+	c.SetMaxIdleMarkWorkers(0)
+	if c.NeedIdleMarkWorker() {
+		t.Fatalf("expected to not need idle mark workers after capacity set to 0")
+	}
+	if c.AddIdleMarkWorker() {
+		t.Fatalf("expected to not be able to add idle mark workers after capacity set to 0")
+	}
+	for i := 0; i < workers-1; i++ {
+		c.RemoveIdleMarkWorker()
+	}
+	if c.NeedIdleMarkWorker() {
+		t.Fatalf("expected to not need idle mark workers after capacity set to 0")
+	}
+	if c.AddIdleMarkWorker() {
+		t.Fatalf("expected to not be able to add idle mark workers after capacity set to 0")
+	}
+	c.SetMaxIdleMarkWorkers(1)
+	if !c.NeedIdleMarkWorker() {
+		t.Fatalf("expected to need idle mark workers after capacity set to 1")
+	}
+	if !c.AddIdleMarkWorker() {
+		t.Fatalf("expected to be able to add idle mark workers after capacity set to 1")
+	}
+}
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 5f50378..1abdbf3 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -163,128 +163,123 @@
 	atomic.Store64(&mheap_.scavengeGoal, retainedGoal)
 }
 
+const (
+	// It doesn't really matter what value we start at, but it can't be zero, because
+	// that'll cause divide-by-zero issues. Pick something conservative which we'll
+	// also use as a fallback.
+	startingScavSleepRatio = 0.001
+
+	// Spend at least 1 ms scavenging, otherwise the corresponding
+	// sleep time to maintain our desired utilization is too low to
+	// be reliable.
+	minScavWorkTime = 1e6
+)
+
 // Sleep/wait state of the background scavenger.
-var scavenge struct {
-	lock                 mutex
-	g                    *g
-	parked               bool
-	timer                *timer
-	sysmonWake           uint32 // Set atomically.
-	printControllerReset bool   // Whether the scavenger is in cooldown.
-}
+var scavenger scavengerState
 
-// readyForScavenger signals sysmon to wake the scavenger because
-// there may be new work to do.
-//
-// There may be a significant delay between when this function runs
-// and when the scavenger is kicked awake, but it may be safely invoked
-// in contexts where wakeScavenger is unsafe to call directly.
-func readyForScavenger() {
-	atomic.Store(&scavenge.sysmonWake, 1)
-}
+type scavengerState struct {
+	// lock protects all fields below.
+	lock mutex
 
-// wakeScavenger immediately unparks the scavenger if necessary.
-//
-// May run without a P, but it may allocate, so it must not be called
-// on any allocation path.
-//
-// mheap_.lock, scavenge.lock, and sched.lock must not be held.
-func wakeScavenger() {
-	lock(&scavenge.lock)
-	if scavenge.parked {
-		// Notify sysmon that it shouldn't bother waking up the scavenger.
-		atomic.Store(&scavenge.sysmonWake, 0)
+	// g is the goroutine the scavenger is bound to.
+	g *g
 
-		// Try to stop the timer but we don't really care if we succeed.
-		// It's possible that either a timer was never started, or that
-		// we're racing with it.
-		// In the case that we're racing with there's the low chance that
-		// we experience a spurious wake-up of the scavenger, but that's
-		// totally safe.
-		stopTimer(scavenge.timer)
+	// parked is whether or not the scavenger is parked.
+	parked bool
 
-		// Unpark the goroutine and tell it that there may have been a pacing
-		// change. Note that we skip the scheduler's runnext slot because we
-		// want to avoid having the scavenger interfere with the fair
-		// scheduling of user goroutines. In effect, this schedules the
-		// scavenger at a "lower priority" but that's OK because it'll
-		// catch up on the work it missed when it does get scheduled.
-		scavenge.parked = false
+	// timer is the timer used for the scavenger to sleep.
+	timer *timer
 
-		// Ready the goroutine by injecting it. We use injectglist instead
-		// of ready or goready in order to allow us to run this function
-		// without a P. injectglist also avoids placing the goroutine in
-		// the current P's runnext slot, which is desirable to prevent
-		// the scavenger from interfering with user goroutine scheduling
-		// too much.
-		var list gList
-		list.push(scavenge.g)
-		injectglist(&list)
-	}
-	unlock(&scavenge.lock)
-}
+	// sysmonWake signals to sysmon that it should wake the scavenger.
+	sysmonWake atomic.Uint32
 
-// scavengeSleep attempts to put the scavenger to sleep for ns.
-//
-// Note that this function should only be called by the scavenger.
-//
-// The scavenger may be woken up earlier by a pacing change, and it may not go
-// to sleep at all if there's a pending pacing change.
-//
-// Returns the amount of time actually slept.
-func scavengeSleep(ns int64) int64 {
-	lock(&scavenge.lock)
+	// targetCPUFraction is the target CPU overhead for the scavenger.
+	targetCPUFraction float64
 
-	// Set the timer.
+	// sleepRatio is the ratio of time spent doing scavenging work to
+	// time spent sleeping. This is used to decide how long the scavenger
+	// should sleep for in between batches of work. It is set by
+	// critSleepController in order to maintain a CPU overhead of
+	// targetCPUFraction.
 	//
-	// This must happen here instead of inside gopark
-	// because we can't close over any variables without
-	// failing escape analysis.
-	start := nanotime()
-	resetTimer(scavenge.timer, start+ns)
+	// Lower means more sleep, higher means more aggressive scavenging.
+	sleepRatio float64
 
-	// Mark ourself as asleep and go to sleep.
-	scavenge.parked = true
-	goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2)
+	// sleepController controls sleepRatio.
+	//
+	// See sleepRatio for more details.
+	sleepController piController
 
-	// Return how long we actually slept for.
-	return nanotime() - start
+	// controllerCooldown is the time left in nanoseconds during which we
+	// avoid using the controller and hold sleepRatio at a conservative
+	// value. Used if the controller's assumptions fail to hold.
+	controllerCooldown int64
+
+	// printControllerReset instructs printScavTrace to signal that
+	// the controller was reset.
+	printControllerReset bool
+
+	// sleepStub is a stub used for testing to avoid actually having
+	// the scavenger sleep.
+	//
+	// Unlike the other stubs, this is not populated in init if left nil.
+	// Instead, it is called when non-nil, because any valid implementation
+	// of this function basically requires closing over this scavenger
+	// state, and allocating a closure is not allowed in the runtime as
+	// a matter of policy.
+	sleepStub func(n int64) int64
+
+	// scavenge is a function that scavenges n bytes of memory.
+	// Returns how many bytes of memory it actually scavenged, as
+	// well as the time it took in nanoseconds. Usually mheap.pages.scavenge
+	// with nanotime called around it, but stubbed out for testing.
+	// Like mheap.pages.scavenge, if it scavenges less than n bytes of
+	// memory, the caller may assume the heap is exhausted of scavengable
+	// memory for now.
+	//
+	// If this is nil, it is populated with the real thing in init.
+	scavenge func(n uintptr) (uintptr, int64)
+
+	// shouldStop is a callback called in the work loop and provides a
+	// point that can force the scavenger to stop early, for example because
+	// the scavenge policy dictates too much has been scavenged already.
+	//
+	// If this is nil, it is populated with the real thing in init.
+	shouldStop func() bool
+
+	// gomaxprocs returns the current value of gomaxprocs. Stub for testing.
+	//
+	// If this is nil, it is populated with the real thing in init.
+	gomaxprocs func() int32
 }
 
-// Background scavenger.
+// init initializes a scavenger state and wires it to the current G.
 //
-// The background scavenger maintains the RSS of the application below
-// the line described by the proportional scavenging statistics in
-// the mheap struct.
-func bgscavenge(c chan int) {
-	scavenge.g = getg()
+// Must be called from a regular goroutine that can allocate.
+func (s *scavengerState) init() {
+	if s.g != nil {
+		throw("scavenger state is already wired")
+	}
+	lockInit(&s.lock, lockRankScavenge)
+	s.g = getg()
 
-	lockInit(&scavenge.lock, lockRankScavenge)
-	lock(&scavenge.lock)
-	scavenge.parked = true
-
-	scavenge.timer = new(timer)
-	scavenge.timer.f = func(_ any, _ uintptr) {
-		wakeScavenger()
+	s.timer = new(timer)
+	s.timer.arg = s
+	s.timer.f = func(s any, _ uintptr) {
+		s.(*scavengerState).wake()
 	}
 
-	c <- 1
-	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
-
-	// idealFraction is the ideal % of overall application CPU time that we
-	// spend scavenging.
-	idealFraction := float64(scavengePercent) / 100.0
-
-	// Input: fraction of CPU time used.
-	// Setpoint: idealFraction.
-	// Output: ratio of critical time to sleep time (determines sleep time).
+	// input: fraction of CPU time actually used.
+	// setpoint: ideal CPU fraction.
+	// output: ratio of time worked to time slept (determines sleep time).
 	//
 	// The output of this controller is somewhat indirect to what we actually
 	// want to achieve: how much time to sleep for. The reason for this definition
 	// is to ensure that the controller's outputs have a direct relationship with
 	// its inputs (as opposed to an inverse relationship), making it somewhat
 	// easier to reason about for tuning purposes.
-	critSleepController := piController{
+	s.sleepController = piController{
 		// Tuned loosely via Ziegler-Nichols process.
 		kp: 0.3375,
 		ti: 3.2e6,
@@ -295,140 +290,278 @@
 		min: 0.001,  // 1:1000
 		max: 1000.0, // 1000:1
 	}
-	// It doesn't really matter what value we start at, but we can't be zero, because
-	// that'll cause divide-by-zero issues. Pick something conservative which we'll
-	// also use as a fallback.
-	const startingCritSleepRatio = 0.001
-	critSleepRatio := startingCritSleepRatio
-	// Duration left in nanoseconds during which we avoid using the controller and
-	// we hold critSleepRatio at a conservative value. Used if the controller's
-	// assumptions fail to hold.
-	controllerCooldown := int64(0)
-	for {
-		released := uintptr(0)
-		crit := float64(0)
+	s.sleepRatio = startingScavSleepRatio
 
-		// Spend at least 1 ms scavenging, otherwise the corresponding
-		// sleep time to maintain our desired utilization is too low to
-		// be reliable.
-		const minCritTime = 1e6
-		for crit < minCritTime {
-			// If background scavenging is disabled or if there's no work to do just park.
-			retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal)
-			if retained <= goal {
-				break
-			}
-
-			// scavengeQuantum is the amount of memory we try to scavenge
-			// in one go. A smaller value means the scavenger is more responsive
-			// to the scheduler in case of e.g. preemption. A larger value means
-			// that the overheads of scavenging are better amortized, so better
-			// scavenging throughput.
-			//
-			// The current value is chosen assuming a cost of ~10µs/physical page
-			// (this is somewhat pessimistic), which implies a worst-case latency of
-			// about 160µs for 4 KiB physical pages. The current value is biased
-			// toward latency over throughput.
-			const scavengeQuantum = 64 << 10
-
-			// Accumulate the amount of time spent scavenging.
+	// Install real functions if stubs aren't present.
+	if s.scavenge == nil {
+		s.scavenge = func(n uintptr) (uintptr, int64) {
 			start := nanotime()
-			r := mheap_.pages.scavenge(scavengeQuantum)
-			atomic.Xadduintptr(&mheap_.pages.scav.released, r)
+			r := mheap_.pages.scavenge(n)
 			end := nanotime()
-
-			// On some platforms we may see end >= start if the time it takes to scavenge
-			// memory is less than the minimum granularity of its clock (e.g. Windows) or
-			// due to clock bugs.
-			//
-			// In this case, just assume scavenging takes 10 µs per regular physical page
-			// (determined empirically), and conservatively ignore the impact of huge pages
-			// on timing.
-			const approxCritNSPerPhysicalPage = 10e3
-			if end <= start {
-				crit += approxCritNSPerPhysicalPage * float64(r/physPageSize)
-			} else {
-				crit += float64(end - start)
+			if start >= end {
+				return r, 0
 			}
-			released += r
-
-			// When using fake time just do one loop.
-			if faketime != 0 {
-				break
-			}
+			return r, end - start
 		}
-
-		if released == 0 {
-			lock(&scavenge.lock)
-			scavenge.parked = true
-			goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
-			continue
+	}
+	if s.shouldStop == nil {
+		s.shouldStop = func() bool {
+			// If background scavenging is disabled or if there's no work to do, just stop.
+			return heapRetained() <= atomic.Load64(&mheap_.scavengeGoal)
 		}
-
-		if released < physPageSize {
-			// If this happens, it means that we may have attempted to release part
-			// of a physical page, but the likely effect of that is that it released
-			// the whole physical page, some of which may have still been in-use.
-			// This could lead to memory corruption. Throw.
-			throw("released less than one physical page of memory")
+	}
+	if s.gomaxprocs == nil {
+		s.gomaxprocs = func() int32 {
+			return gomaxprocs
 		}
+	}
+}
 
-		if crit < minCritTime {
-			// This means there wasn't enough work to actually fill up minCritTime.
-			// That's fine; we shouldn't try to do anything with this information
-			// because it's going result in a short enough sleep request that things
-			// will get messy. Just assume we did at least this much work.
-			// All this means is that we'll sleep longer than we otherwise would have.
-			crit = minCritTime
-		}
+// park parks the scavenger goroutine.
+func (s *scavengerState) park() {
+	lock(&s.lock)
+	if getg() != s.g {
+		throw("tried to park scavenger from another goroutine")
+	}
+	s.parked = true
+	goparkunlock(&s.lock, waitReasonGCScavengeWait, traceEvGoBlock, 2)
+}
 
-		// Multiply the critical time by 1 + the ratio of the costs of using
-		// scavenged memory vs. scavenging memory. This forces us to pay down
-		// the cost of reusing this memory eagerly by sleeping for a longer period
-		// of time and scavenging less frequently. More concretely, we avoid situations
-		// where we end up scavenging so often that we hurt allocation performance
-		// because of the additional overheads of using scavenged memory.
-		crit *= 1 + scavengeCostRatio
+// ready signals to sysmon that the scavenger should be awoken.
+func (s *scavengerState) ready() {
+	s.sysmonWake.Store(1)
+}
 
-		// Go to sleep based on how much time we spent doing work.
-		slept := scavengeSleep(int64(crit / critSleepRatio))
+// wake immediately unparks the scavenger if necessary.
+//
+// Safe to run without a P.
+func (s *scavengerState) wake() {
+	lock(&s.lock)
+	if s.parked {
+		// Unset sysmonWake, since the scavenger is now being awoken.
+		s.sysmonWake.Store(0)
 
-		// Stop here if we're cooling down from the controller.
-		if controllerCooldown > 0 {
-			// crit and slept aren't exact measures of time, but it's OK to be a bit
-			// sloppy here. We're just hoping we're avoiding some transient bad behavior.
-			t := slept + int64(crit)
-			if t > controllerCooldown {
-				controllerCooldown = 0
-			} else {
-				controllerCooldown -= t
-			}
-			continue
-		}
+		// s.parked is unset to prevent a double wake-up.
+		s.parked = false
 
-		// Calculate the CPU time spent.
+		// Ready the goroutine by injecting it. We use injectglist instead
+		// of ready or goready in order to allow us to run this function
+		// without a P. injectglist also avoids placing the goroutine in
+		// the current P's runnext slot, which is desirable to prevent
+		// the scavenger from interfering with user goroutine scheduling
+		// too much.
+		var list gList
+		list.push(s.g)
+		injectglist(&list)
+	}
+	unlock(&s.lock)
+}
+
+// sleep puts the scavenger to sleep based on the amount of time that it worked
+// in nanoseconds.
+//
+// Note that this function should only be called by the scavenger.
+//
+// The scavenger may be woken up earlier by a pacing change, and it may not go
+// to sleep at all if there's a pending pacing change.
+func (s *scavengerState) sleep(worked float64) {
+	lock(&s.lock)
+	if getg() != s.g {
+		throw("tried to sleep scavenger from another goroutine")
+	}
+
+	if worked < minScavWorkTime {
+		// This means there wasn't enough work to actually fill up minScavWorkTime.
+		// That's fine; we shouldn't try to do anything with this information
+	// because it's going to result in a short enough sleep request that things
+		// will get messy. Just assume we did at least this much work.
+		// All this means is that we'll sleep longer than we otherwise would have.
+		worked = minScavWorkTime
+	}
+
+	// Multiply the time worked by 1 + the ratio of the costs of using
+	// scavenged memory vs. scavenging memory. This forces us to pay down
+	// the cost of reusing this memory eagerly by sleeping for a longer period
+	// of time and scavenging less frequently. More concretely, we avoid situations
+	// where we end up scavenging so often that we hurt allocation performance
+	// because of the additional overheads of using scavenged memory.
+	worked *= 1 + scavengeCostRatio
+
+	// sleepTime is the amount of time we're going to sleep, based on the amount
+	// of time we worked, and the sleepRatio.
+	sleepTime := int64(worked / s.sleepRatio)
+
+	var slept int64
+	if s.sleepStub == nil {
+		// Set the timer.
 		//
-		// This may be slightly inaccurate with respect to GOMAXPROCS, but we're
-		// recomputing this often enough relative to GOMAXPROCS changes in general
-		// (it only changes when the world is stopped, and not during a GC) that
-		// that small inaccuracy is in the noise.
-		cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs))
+		// This must happen here instead of inside gopark
+		// because we can't close over any variables without
+		// failing escape analysis.
+		start := nanotime()
+		resetTimer(s.timer, start+sleepTime)
 
-		// Update the critSleepRatio, adjusting until we reach our ideal fraction.
-		var ok bool
-		critSleepRatio, ok = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit)
-		if !ok {
-			// The core assumption of the controller, that we can get a proportional
-			// response, broke down. This may be transient, so temporarily switch to
-			// sleeping a fixed, conservative amount.
-			critSleepRatio = startingCritSleepRatio
-			controllerCooldown = 5e9 // 5 seconds.
+		// Mark ourselves as asleep and go to sleep.
+		s.parked = true
+		goparkunlock(&s.lock, waitReasonSleep, traceEvGoSleep, 2)
 
-			// Signal the scav trace printer to output this.
-			lock(&scavenge.lock)
-			scavenge.printControllerReset = true
-			unlock(&scavenge.lock)
+		// How long we actually slept for.
+		slept = nanotime() - start
+
+		lock(&s.lock)
+		// Stop the timer here because s.wake is unable to do it for us.
+		// We don't really care if we succeed in stopping the timer. One
+		// reason we might fail is that we've already woken up, but the timer
+		// might be in the process of firing on some other P; essentially we're
+		// racing with it. That's totally OK. Double wake-ups are perfectly safe.
+		stopTimer(s.timer)
+		unlock(&s.lock)
+	} else {
+		unlock(&s.lock)
+		slept = s.sleepStub(sleepTime)
+	}
+
+	// Stop here if we're cooling down from the controller.
+	if s.controllerCooldown > 0 {
+		// worked and slept aren't exact measures of time, but it's OK to be a bit
+		// sloppy here. We're just hoping we're avoiding some transient bad behavior.
+		t := slept + int64(worked)
+		if t > s.controllerCooldown {
+			s.controllerCooldown = 0
+		} else {
+			s.controllerCooldown -= t
 		}
+		return
+	}
+
+	// idealFraction is the ideal % of overall application CPU time that we
+	// spend scavenging.
+	idealFraction := float64(scavengePercent) / 100.0
+
+	// Calculate the CPU time spent.
+	//
+	// This may be slightly inaccurate with respect to GOMAXPROCS, but we're
+	// recomputing this often enough relative to GOMAXPROCS changes in general
+	// (it only changes when the world is stopped, and not during a GC) that
+	// that small inaccuracy is in the noise.
+	cpuFraction := worked / ((float64(slept) + worked) * float64(s.gomaxprocs()))
+
+	// Update the sleep ratio, adjusting until we reach our ideal fraction.
+	var ok bool
+	s.sleepRatio, ok = s.sleepController.next(cpuFraction, idealFraction, float64(slept)+worked)
+	if !ok {
+		// The core assumption of the controller, that we can get a proportional
+		// response, broke down. This may be transient, so temporarily switch to
+		// sleeping a fixed, conservative amount.
+		s.sleepRatio = startingScavSleepRatio
+		s.controllerCooldown = 5e9 // 5 seconds.
+
+		// Signal the scav trace printer to output this.
+		s.controllerFailed()
+	}
+}
+
+// controllerFailed indicates that the scavenger's scheduling
+// controller failed.
+func (s *scavengerState) controllerFailed() {
+	lock(&s.lock)
+	s.printControllerReset = true
+	unlock(&s.lock)
+}
+
+// run is the body of the main scavenging loop.
+//
+// Returns the number of bytes released and the estimated time spent
+// releasing those bytes.
+//
+// Must be run on the scavenger goroutine.
+func (s *scavengerState) run() (released uintptr, worked float64) {
+	lock(&s.lock)
+	if getg() != s.g {
+		throw("tried to run scavenger from another goroutine")
+	}
+	unlock(&s.lock)
+
+	for worked < minScavWorkTime {
+		// If something from outside tells us to stop early, stop.
+		if s.shouldStop() {
+			break
+		}
+
+		// scavengeQuantum is the amount of memory we try to scavenge
+		// in one go. A smaller value means the scavenger is more responsive
+		// to the scheduler in case of e.g. preemption. A larger value means
+		// that the overheads of scavenging are better amortized, so better
+		// scavenging throughput.
+		//
+		// The current value is chosen assuming a cost of ~10µs/physical page
+		// (this is somewhat pessimistic), which implies a worst-case latency of
+		// about 160µs for 4 KiB physical pages. The current value is biased
+		// toward latency over throughput.
+		const scavengeQuantum = 64 << 10
+
+		// Accumulate the amount of time spent scavenging.
+		r, duration := s.scavenge(scavengeQuantum)
+
+		// On some platforms we may see end >= start if the time it takes to scavenge
+		// memory is less than the minimum granularity of its clock (e.g. Windows) or
+		// due to clock bugs.
+		//
+		// In this case, just assume scavenging takes 10 µs per regular physical page
+		// (determined empirically), and conservatively ignore the impact of huge pages
+		// on timing.
+		const approxWorkedNSPerPhysicalPage = 10e3
+		if duration == 0 {
+			worked += approxWorkedNSPerPhysicalPage * float64(r/physPageSize)
+		} else {
+			// TODO(mknyszek): If duration is small compared to worked, it could be
+			// rounded down to zero. Probably not a problem in practice because the
+			// values are all within a few orders of magnitude of each other but maybe
+			// worth worrying about.
+			worked += float64(duration)
+		}
+		released += r
+
+		// scavenge does not return until it either finds the requisite amount of
+		// memory to scavenge, or exhausts the heap. If we haven't found enough
+		// to scavenge, then the heap must be exhausted.
+		if r < scavengeQuantum {
+			break
+		}
+		// When using fake time just do one loop.
+		if faketime != 0 {
+			break
+		}
+	}
+	if released > 0 && released < physPageSize {
+		// If this happens, it means that we may have attempted to release part
+		// of a physical page, but the likely effect of that is that it released
+		// the whole physical page, some of which may have still been in-use.
+		// This could lead to memory corruption. Throw.
+		throw("released less than one physical page of memory")
+	}
+	return
+}
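
The zero-duration fallback in run is a fixed cost model rather than a measurement. Under the comment's assumptions (roughly 10µs per 4 KiB physical page), releasing one 64 KiB quantum while the clock reports no elapsed time is credited as 16 pages at 10µs each, i.e. 160µs. A small sketch of that arithmetic:

	package main

	import "fmt"

	func main() {
		const approxWorkedNSPerPhysicalPage = 10e3 // ~10µs per page, as above
		released := uintptr(64 << 10)              // one scavengeQuantum
		physPageSize := uintptr(4 << 10)           // assuming 4 KiB pages
		worked := approxWorkedNSPerPhysicalPage * float64(released/physPageSize)
		fmt.Printf("credited work: %.0f ns\n", worked) // 16 pages * 10µs = 160000 ns
	}
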
+
+// Background scavenger.
+//
+// The background scavenger maintains the RSS of the application below
+// the line described by the proportional scavenging statistics in
+// the mheap struct.
+func bgscavenge(c chan int) {
+	scavenger.init()
+
+	c <- 1
+	scavenger.park()
+
+	for {
+		released, workTime := scavenger.run()
+		if released == 0 {
+			scavenger.park()
+			continue
+		}
+		atomic.Xadduintptr(&mheap_.pages.scav.released, released)
+		scavenger.sleep(workTime)
 	}
 }
 
@@ -438,6 +571,9 @@
 // back to the top of the heap.
 //
 // Returns the amount of memory scavenged in bytes.
+//
+// scavenge always tries to scavenge nbytes worth of memory, and will
+// only fail to do so if the heap is exhausted for now.
 func (p *pageAlloc) scavenge(nbytes uintptr) uintptr {
 	var (
 		addrs addrRange
@@ -468,9 +604,9 @@
 // was called, and forced indicates whether the scavenge was forced by the
 // application.
 //
-// scavenge.lock must be held.
+// scavenger.lock must be held.
 func printScavTrace(gen uint32, released uintptr, forced bool) {
-	assertLockHeld(&scavenge.lock)
+	assertLockHeld(&scavenger.lock)
 
 	printlock()
 	print("scav ", gen, " ",
@@ -480,9 +616,9 @@
 	)
 	if forced {
 		print(" (forced)")
-	} else if scavenge.printControllerReset {
+	} else if scavenger.printControllerReset {
 		print(" [controller reset]")
-		scavenge.printControllerReset = false
+		scavenger.printControllerReset = false
 	}
 	println()
 	printunlock()
diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go
index 0659293..8d92295 100644
--- a/src/runtime/mgcscavenge_test.go
+++ b/src/runtime/mgcscavenge_test.go
@@ -9,7 +9,9 @@
 	"internal/goos"
 	"math/rand"
 	. "runtime"
+	"runtime/internal/atomic"
 	"testing"
+	"time"
 )
 
 // makePallocData produces an initialized PallocData by setting
@@ -449,3 +451,113 @@
 		})
 	}
 }
+
+func TestScavenger(t *testing.T) {
+	// workedTime converts bytes of scavenge work to time elapsed,
+	// using the same ~10µs-per-page cost model as the scavenger.
+	workedTime := func(bytes uintptr) int64 {
+		return int64((bytes+4095)/4096) * int64(10*time.Microsecond)
+	}
+
+	// Set up a bunch of state that we're going to track and verify
+	// throughout the test.
+	totalWork := uint64(64<<20 - 3*PhysPageSize)
+	var totalSlept, totalWorked atomic.Int64
+	var availableWork atomic.Uint64
+	var stopAt atomic.Uint64 // How much available work to stop at.
+
+	// Set up the scavenger.
+	var s Scavenger
+	s.Sleep = func(ns int64) int64 {
+		totalSlept.Add(ns)
+		return ns
+	}
+	s.Scavenge = func(bytes uintptr) (uintptr, int64) {
+		avail := availableWork.Load()
+		if uint64(bytes) > avail {
+			bytes = uintptr(avail)
+		}
+		t := workedTime(bytes)
+		if bytes != 0 {
+			availableWork.Add(-int64(bytes))
+			totalWorked.Add(t)
+		}
+		return bytes, t
+	}
+	s.ShouldStop = func() bool {
+		return availableWork.Load() <= stopAt.Load()
+	}
+	s.GoMaxProcs = func() int32 {
+		return 1
+	}
+
+	// Define a helper for verifying that various properties hold.
+	verifyScavengerState := func(t *testing.T, expWork uint64) {
+		t.Helper()
+
+		// Check to make sure it did the amount of work we expected.
+		if workDone := uint64(s.Released()); workDone != expWork {
+			t.Errorf("want %d bytes of work done, got %d", expWork, workDone)
+		}
+		// Check to make sure the scavenger is meeting its CPU target.
+		idealFraction := float64(ScavengePercent) / 100.0
+		cpuFraction := float64(totalWorked.Load()) / float64(totalWorked.Load()+totalSlept.Load())
+		if cpuFraction < idealFraction-0.005 || cpuFraction > idealFraction+0.005 {
+			t.Errorf("want %f CPU fraction, got %f", idealFraction, cpuFraction)
+		}
+	}
+
+	// Start the scavenger.
+	s.Start()
+
+	// Set up some work and let the scavenger run to completion.
+	availableWork.Store(totalWork)
+	s.Wake()
+	if !s.BlockUntilParked(2e9 /* 2 seconds */) {
+		t.Fatal("timed out waiting for scavenger to run to completion")
+	}
+	// Run a check.
+	verifyScavengerState(t, totalWork)
+
+	// Now let's do it again and see what happens when we have no work to do.
+	// It should've gone right back to sleep.
+	s.Wake()
+	if !s.BlockUntilParked(2e9 /* 2 seconds */) {
+		t.Fatal("timed out waiting for scavenger to run to completion")
+	}
+	// Run another check.
+	verifyScavengerState(t, totalWork)
+
+	// One more time, this time doing the same amount of work as the first time.
+	// Let's see if we can get the scavenger to continue.
+	availableWork.Store(totalWork)
+	s.Wake()
+	if !s.BlockUntilParked(2e9 /* 2 seconds */) {
+		t.Fatal("timed out waiting for scavenger to run to completion")
+	}
+	// Run another check.
+	verifyScavengerState(t, 2*totalWork)
+
+	// This time, let's stop after a certain amount of work.
+	//
+	// Pick a stopping point such that when subtracted from totalWork
+	// we get a multiple of a relatively large power of 2. verifyScavengerState
+	// always makes an exact check, but the scavenger might go a little over,
+	// which is OK. If this breaks often or gets annoying to maintain, modify
+	// verifyScavengerState.
+	availableWork.Store(totalWork)
+	stoppingPoint := uint64(1<<20 - 3*PhysPageSize)
+	stopAt.Store(stoppingPoint)
+	s.Wake()
+	if !s.BlockUntilParked(2e9 /* 2 seconds */) {
+		t.Fatal("timed out waiting for scavenger to run to completion")
+	}
+	// Run another check.
+	verifyScavengerState(t, 2*totalWork+(totalWork-stoppingPoint))
+
+	// Clean up.
+	s.Stop()
+}
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index d0b81fd..a5e04d6 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -263,7 +263,7 @@
 	// Sweeping is done, so if the scavenger isn't already awake,
 	// wake it up. There's definitely work for it to do at this
 	// point.
-	wakeScavenger()
+	scavenger.wake()
 
 	nextMarkBitArenaEpoch()
 }
@@ -403,10 +403,7 @@
 			mheap_.pages.scavengeStartGen()
 			unlock(&mheap_.lock)
 		})
-		// Since we might sweep in an allocation path, it's not possible
-		// for us to wake the scavenger directly via wakeScavenger, since
-		// it could allocate. Ask sysmon to do it for us instead.
-		readyForScavenger()
+		scavenger.ready()
 	}
 
 	gp.m.locks--
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index d99363d..1c98afc 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -449,16 +449,17 @@
 	// if sweepgen == h->sweepgen + 3, the span was swept and then cached and is still cached
 	// h->sweepgen is incremented by 2 after every GC
 
-	sweepgen    uint32
-	divMul      uint32        // for divide by elemsize
-	allocCount  uint16        // number of allocated objects
-	spanclass   spanClass     // size class and noscan (uint8)
-	state       mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
-	needzero    uint8         // needs to be zeroed before allocation
-	elemsize    uintptr       // computed from sizeclass or from npages
-	limit       uintptr       // end of data in span
-	speciallock mutex         // guards specials list
-	specials    *special      // linked list of special records sorted by offset.
+	sweepgen              uint32
+	divMul                uint32        // for divide by elemsize
+	allocCount            uint16        // number of allocated objects
+	spanclass             spanClass     // size class and noscan (uint8)
+	state                 mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
+	needzero              uint8         // needs to be zeroed before allocation
+	allocCountBeforeCache uint16        // a copy of allocCount that is stored just before this span is cached
+	elemsize              uintptr       // computed from sizeclass or from npages
+	limit                 uintptr       // end of data in span
+	speciallock           mutex         // guards specials list
+	specials              *special      // linked list of special records sorted by offset.
 }
 
 func (s *mspan) base() uintptr {
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 4535f62..b72194c 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -2351,6 +2351,11 @@
 		startm(_p_, false)
 		return
 	}
+	// if there's trace work to do, start it straight away
+	if (trace.enabled || trace.shutdown) && traceReaderAvailable() {
+		startm(_p_, false)
+		return
+	}
 	// if it has GC work, start it straight away
 	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
 		startm(_p_, false)
@@ -2535,7 +2540,9 @@
 
 // Finds a runnable goroutine to execute.
 // Tries to steal from other P's, get g from local or global queue, poll network.
-func findrunnable() (gp *g, inheritTime bool) {
+// tryWakeP indicates that the returned goroutine is not normal (GC worker, trace
+// reader) so the caller should try to wake a P.
+func findRunnable() (gp *g, inheritTime, tryWakeP bool) {
 	_g_ := getg()
 
 	// The conditions here and in handoffp must agree: if
@@ -2552,8 +2559,43 @@
 		runSafePointFn()
 	}
 
+	// now and pollUntil are saved for work stealing later,
+	// which may steal timers. It's important that between now
+	// and then, nothing blocks, so these numbers remain mostly
+	// relevant.
 	now, pollUntil, _ := checkTimers(_p_, 0)
 
+	// Try to schedule the trace reader.
+	if trace.enabled || trace.shutdown {
+		gp = traceReader()
+		if gp != nil {
+			casgstatus(gp, _Gwaiting, _Grunnable)
+			traceGoUnpark(gp, 0)
+			return gp, false, true
+		}
+	}
+
+	// Try to schedule a GC worker.
+	if gcBlackenEnabled != 0 {
+		gp = gcController.findRunnableGCWorker(_p_)
+		if gp != nil {
+			return gp, false, true
+		}
+	}
+
+	// Check the global runnable queue once in a while to ensure fairness.
+	// Otherwise two goroutines can completely occupy the local runqueue
+	// by constantly respawning each other.
+	if _p_.schedtick%61 == 0 && sched.runqsize > 0 {
+		lock(&sched.lock)
+		gp = globrunqget(_p_, 1)
+		unlock(&sched.lock)
+		if gp != nil {
+			return gp, false, false
+		}
+	}
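
The starvation pattern that this schedtick%61 poll defends against is easy to sketch outside the runtime: goroutines that keep respawning successors can keep a P's local run queue occupied indefinitely, so the global queue must be polled periodically. A self-contained illustration (not runtime code):

	package main

	import "fmt"

	func main() {
		done := make(chan struct{})
		var spawn func(n int)
		spawn = func(n int) {
			if n == 0 {
				close(done)
				return
			}
			// Each goroutine immediately respawns a successor: the shape of
			// workload that can keep a P's local run queue non-empty forever.
			go spawn(n - 1)
		}
		go spawn(1000)
		<-done
		fmt.Println("respawning chain finished")
	}
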
+
+	// Wake up the finalizer G.
 	if fingwait && fingwake {
 		if gp := wakefing(); gp != nil {
 			ready(gp, 0, true)
@@ -2565,7 +2607,7 @@
 
 	// local runq
 	if gp, inheritTime := runqget(_p_); gp != nil {
-		return gp, inheritTime
+		return gp, inheritTime, false
 	}
 
 	// global runq
@@ -2574,7 +2616,7 @@
 		gp := globrunqget(_p_, 0)
 		unlock(&sched.lock)
 		if gp != nil {
-			return gp, false
+			return gp, false, false
 		}
 	}
 
@@ -2593,7 +2635,7 @@
 			if trace.enabled {
 				traceGoUnpark(gp, 0)
 			}
-			return gp, false
+			return gp, false, false
 		}
 	}
 
@@ -2613,7 +2655,7 @@
 		now = tnow
 		if gp != nil {
 			// Successfully stole.
-			return gp, inheritTime
+			return gp, inheritTime, false
 		}
 		if newWork {
 			// There may be new timer or GC work; restart to
@@ -2629,9 +2671,8 @@
 	// We have nothing to do.
 	//
 	// If we're in the GC mark phase, can safely scan and blacken objects,
-	// and have work to do, run idle-time marking rather than give up the
-	// P.
-	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
+	// and have work to do, run idle-time marking rather than give up the P.
+	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) && gcController.addIdleMarkWorker() {
 		node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
 		if node != nil {
 			_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
@@ -2640,8 +2681,9 @@
 			if trace.enabled {
 				traceGoUnpark(gp, 0)
 			}
-			return gp, false
+			return gp, false, false
 		}
+		gcController.removeIdleMarkWorker()
 	}
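
The addIdleMarkWorker/removeIdleMarkWorker pair above acts as a reservation with rollback: a worker slot is claimed before popping a worker node, and released again if the pop comes back empty. A generic sketch of the idiom (hypothetical names, not the runtime's gcController):

	package main

	import "sync/atomic"

	// slots models a bounded pool of idle-worker slots.
	type slots struct{ n atomic.Int64 }

	// tryReserve claims one of max slots, like addIdleMarkWorker.
	func (s *slots) tryReserve(max int64) bool {
		for {
			cur := s.n.Load()
			if cur >= max {
				return false
			}
			if s.n.CompareAndSwap(cur, cur+1) {
				return true
			}
		}
	}

	// release rolls a claim back, like removeIdleMarkWorker.
	func (s *slots) release() { s.n.Add(-1) }

	func acquire(s *slots, pop func() any) any {
		if !s.tryReserve(4) {
			return nil // no slot free
		}
		if w := pop(); w != nil {
			return w
		}
		s.release() // no worker node available: roll back the reservation
		return nil
	}

	func main() {
		var s slots
		_ = acquire(&s, func() any { return nil })
	}
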
 
 	// wasm only:
@@ -2654,7 +2696,7 @@
 		if trace.enabled {
 			traceGoUnpark(gp, 0)
 		}
-		return gp, false
+		return gp, false, false
 	}
 	if otherReady {
 		goto top
@@ -2679,7 +2721,7 @@
 	if sched.runqsize != 0 {
 		gp := globrunqget(_p_, 0)
 		unlock(&sched.lock)
-		return gp, false
+		return gp, false, false
 	}
 	if releasep() != _p_ {
 		throw("findrunnable: wrong p")
@@ -2742,7 +2784,7 @@
 			if trace.enabled {
 				traceGoUnpark(gp, 0)
 			}
-			return gp, false
+			return gp, false, false
 		}
 
 		// Finally, check for timer creation or expiry concurrently with
@@ -2800,7 +2842,7 @@
 				if trace.enabled {
 					traceGoUnpark(gp, 0)
 				}
-				return gp, false
+				return gp, false, false
 			}
 			if wasSpinning {
 				_g_.m.spinning = true
@@ -2959,8 +3001,12 @@
 // returned. The returned P has not been wired yet.
 func checkIdleGCNoP() (*p, *g) {
 	// N.B. Since we have no P, gcBlackenEnabled may change at any time; we
-	// must check again after acquiring a P.
-	if atomic.Load(&gcBlackenEnabled) == 0 {
+	// must check again after acquiring a P. As an optimization, we also check
+	// if an idle mark worker is needed at all. This is OK here, because if we
+	// observe that one isn't needed, at least one is currently running. Even if
+	// it stops running, its own journey into the scheduler should schedule it
+	// again, if need be (at which point, this check will pass, if relevant).
+	if atomic.Load(&gcBlackenEnabled) == 0 || !gcController.needIdleMarkWorker() {
 		return nil, nil
 	}
 	if !gcMarkWorkAvailable(nil) {
@@ -2991,9 +3037,8 @@
 		return nil, nil
 	}
 
-	// Now that we own a P, gcBlackenEnabled can't change (as it requires
-	// STW).
-	if gcBlackenEnabled == 0 {
+	// Now that we own a P, gcBlackenEnabled can't change (as it requires STW).
+	if gcBlackenEnabled == 0 || !gcController.addIdleMarkWorker() {
 		pidleput(pp)
 		unlock(&sched.lock)
 		return nil, nil
@@ -3003,6 +3048,7 @@
 	if node == nil {
 		pidleput(pp)
 		unlock(&sched.lock)
+		gcController.removeIdleMarkWorker()
 		return nil, nil
 	}
 
@@ -3143,62 +3189,14 @@
 	pp := _g_.m.p.ptr()
 	pp.preempt = false
 
-	if sched.gcwaiting != 0 {
-		gcstopm()
-		goto top
-	}
-	if pp.runSafePointFn != 0 {
-		runSafePointFn()
-	}
-
-	// Sanity check: if we are spinning, the run queue should be empty.
+	// Safety check: if we are spinning, the run queue should be empty.
 	// Check this before calling checkTimers, as that might call
 	// goready to put a ready goroutine on the local run queue.
 	if _g_.m.spinning && (pp.runnext != 0 || pp.runqhead != pp.runqtail) {
 		throw("schedule: spinning with local work")
 	}
 
-	checkTimers(pp, 0)
-
-	var gp *g
-	var inheritTime bool
-
-	// Normal goroutines will check for need to wakeP in ready,
-	// but GCworkers and tracereaders will not, so the check must
-	// be done here instead.
-	tryWakeP := false
-	if trace.enabled || trace.shutdown {
-		gp = traceReader()
-		if gp != nil {
-			casgstatus(gp, _Gwaiting, _Grunnable)
-			traceGoUnpark(gp, 0)
-			tryWakeP = true
-		}
-	}
-	if gp == nil && gcBlackenEnabled != 0 {
-		gp = gcController.findRunnableGCWorker(_g_.m.p.ptr())
-		if gp != nil {
-			tryWakeP = true
-		}
-	}
-	if gp == nil {
-		// Check the global runnable queue once in a while to ensure fairness.
-		// Otherwise two goroutines can completely occupy the local runqueue
-		// by constantly respawning each other.
-		if _g_.m.p.ptr().schedtick%61 == 0 && sched.runqsize > 0 {
-			lock(&sched.lock)
-			gp = globrunqget(_g_.m.p.ptr(), 1)
-			unlock(&sched.lock)
-		}
-	}
-	if gp == nil {
-		gp, inheritTime = runqget(_g_.m.p.ptr())
-		// We can see gp != nil here even if the M is spinning,
-		// if checkTimers added a local goroutine via goready.
-	}
-	if gp == nil {
-		gp, inheritTime = findrunnable() // blocks until work is available
-	}
+	gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available
 
 	// This thread is going to run a goroutine and is not spinning anymore,
 	// so if it was marked as spinning we need to reset it now and potentially
@@ -3658,7 +3656,7 @@
 
 // Standard syscall entry used by the go syscall library and normal cgo calls.
 //
-// This is exported via linkname to assembly in the syscall package.
+// This is exported via linkname to assembly in the syscall package and x/sys.
 //
 //go:nosplit
 //go:linkname entersyscall
@@ -5184,9 +5182,9 @@
 				startm(nil, false)
 			}
 		}
-		if atomic.Load(&scavenge.sysmonWake) != 0 {
+		if scavenger.sysmonWake.Load() != 0 {
 			// Kick the scavenger awake if someone requested it.
-			wakeScavenger()
+			scavenger.wake()
 		}
 		// retake P's blocked in syscalls
 		// and preempt long running G's
@@ -5904,10 +5902,10 @@
 						// between different Ps.
 						// A sync chan send/recv takes ~50ns as of time of
 						// writing, so 3us gives ~50x overshoot.
-						if GOOS != "windows" {
+						if GOOS != "windows" && GOOS != "openbsd" {
 							usleep(3)
 						} else {
-							// On windows system timer granularity is
+							// On some platforms system timer granularity is
 							// 1-15ms, which is way too much for this
 							// optimization. So just yield.
 							osyield()
diff --git a/src/runtime/race/README b/src/runtime/race/README
index 7ec2f80..eb18ad6 100644
--- a/src/runtime/race/README
+++ b/src/runtime/race/README
@@ -13,3 +13,4 @@
 race_linux_arm64.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8.
 race_darwin_arm64.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8.
 race_openbsd_amd64.syso built with LLVM fcf6ae2f070eba73074b6ec8d8281e54d29dbeeb and Go 8f2db14cd35bbd674cb2988a508306de6655e425.
+race_linux_s390x.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8.
diff --git a/src/runtime/race/race_linux_s390x.syso b/src/runtime/race/race_linux_s390x.syso
new file mode 100644
index 0000000..ed4a300
--- /dev/null
+++ b/src/runtime/race/race_linux_s390x.syso
Binary files differ
diff --git a/src/runtime/rand_test.go b/src/runtime/rand_test.go
index 1b84c79..92d07eb 100644
--- a/src/runtime/rand_test.go
+++ b/src/runtime/rand_test.go
@@ -18,6 +18,14 @@
 	})
 }
 
+func BenchmarkFastrand64(b *testing.B) {
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			Fastrand64()
+		}
+	})
+}
+
 func BenchmarkFastrandHashiter(b *testing.B) {
 	var m = make(map[int]int, 10)
 	for i := 0; i < 10; i++ {
diff --git a/src/runtime/signal_amd64.go b/src/runtime/signal_amd64.go
index 67a2195..8ade208 100644
--- a/src/runtime/signal_amd64.go
+++ b/src/runtime/signal_amd64.go
@@ -40,9 +40,10 @@
 //go:nowritebarrierrec
 func (c *sigctxt) sigpc() uintptr { return uintptr(c.rip()) }
 
-func (c *sigctxt) sigsp() uintptr { return uintptr(c.rsp()) }
-func (c *sigctxt) siglr() uintptr { return 0 }
-func (c *sigctxt) fault() uintptr { return uintptr(c.sigaddr()) }
+func (c *sigctxt) setsigpc(x uint64) { c.set_rip(x) }
+func (c *sigctxt) sigsp() uintptr    { return uintptr(c.rsp()) }
+func (c *sigctxt) siglr() uintptr    { return 0 }
+func (c *sigctxt) fault() uintptr    { return uintptr(c.sigaddr()) }
 
 // preparePanic sets up the stack to look like a call to sigpanic.
 func (c *sigctxt) preparePanic(sig uint32, gp *g) {
diff --git a/src/runtime/signal_arm64.go b/src/runtime/signal_arm64.go
index 771585a..c8b8781 100644
--- a/src/runtime/signal_arm64.go
+++ b/src/runtime/signal_arm64.go
@@ -53,8 +53,9 @@
 //go:nowritebarrierrec
 func (c *sigctxt) sigpc() uintptr { return uintptr(c.pc()) }
 
-func (c *sigctxt) sigsp() uintptr { return uintptr(c.sp()) }
-func (c *sigctxt) siglr() uintptr { return uintptr(c.lr()) }
+func (c *sigctxt) setsigpc(x uint64) { c.set_pc(x) }
+func (c *sigctxt) sigsp() uintptr    { return uintptr(c.sp()) }
+func (c *sigctxt) siglr() uintptr    { return uintptr(c.lr()) }
 
 // preparePanic sets up the stack to look like a call to sigpanic.
 func (c *sigctxt) preparePanic(sig uint32, gp *g) {
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 8c4ab3e..929f8fa 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -157,11 +157,50 @@
 	return uint32(uint64(fastrand()) * uint64(n) >> 32)
 }
 
+func fastrand64() uint64 {
+	mp := getg().m
+	// Implement wyrand: https://github.com/wangyi-fudan/wyhash
+	// Only platforms on which math.Mul64 can be lowered
+	// by the compiler should be in this list.
+	if goarch.IsAmd64|goarch.IsArm64|goarch.IsPpc64|
+		goarch.IsPpc64le|goarch.IsMips64|goarch.IsMips64le|
+		goarch.IsS390x|goarch.IsRiscv64 == 1 {
+		mp.fastrand += 0xa0761d6478bd642f
+		hi, lo := math.Mul64(mp.fastrand, mp.fastrand^0xe7037ed1a0b428db)
+		return hi ^ lo
+	}
+
+	// Implement xorshift64+: 2 32-bit xorshift sequences added together.
+	// Xorshift paper: https://www.jstatsoft.org/article/view/v008i14/xorshift.pdf
+	// This generator passes the SmallCrush suite, part of TestU01 framework:
+	// http://simul.iro.umontreal.ca/testu01/tu01.html
+	t := (*[2]uint32)(unsafe.Pointer(&mp.fastrand))
+	s1, s0 := t[0], t[1]
+	s1 ^= s1 << 17
+	s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16
+	r := uint64(s0 + s1)
+
+	s0, s1 = s1, s0
+	s1 ^= s1 << 17
+	s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16
+	r += uint64(s0+s1) << 32
+
+	t[0], t[1] = s0, s1
+	return r
+}
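
For experimenting outside the runtime, the wyrand step above can be reproduced with the exported math/bits.Mul64 standing in for the runtime-internal math.Mul64. The constants below are the ones used above; the seed is arbitrary. A sketch:

	package main

	import (
		"fmt"
		"math/bits"
	)

	type wyrand uint64

	// Next advances the state by an odd constant, then mixes the state with a
	// 64x64->128-bit multiply and folds the two halves together.
	func (r *wyrand) Next() uint64 {
		*r += 0xa0761d6478bd642f
		hi, lo := bits.Mul64(uint64(*r), uint64(*r)^0xe7037ed1a0b428db)
		return hi ^ lo
	}

	func main() {
		r := wyrand(42) // hypothetical seed
		for i := 0; i < 3; i++ {
			fmt.Printf("%#016x\n", r.Next())
		}
	}
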
+
+func fastrandu() uint {
+	if goarch.PtrSize == 4 {
+		return uint(fastrand())
+	}
+	return uint(fastrand64())
+}
+
 //go:linkname sync_fastrandn sync.fastrandn
 func sync_fastrandn(n uint32) uint32 { return fastrandn(n) }
 
-//go:linkname net_fastrand net.fastrand
-func net_fastrand() uint32 { return fastrand() }
+//go:linkname net_fastrandu net.fastrandu
+func net_fastrandu() uint { return fastrandu() }
 
 //go:linkname os_fastrand os.fastrand
 func os_fastrand() uint32 { return fastrand() }
diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s
index db4715d..8e75a38 100644
--- a/src/runtime/sys_darwin_amd64.s
+++ b/src/runtime/sys_darwin_amd64.s
@@ -218,13 +218,21 @@
 	// Transition from C ABI to Go ABI.
 	PUSH_REGS_HOST_TO_ABI0()
 
-	// Call into the Go signal handler
+	// Set up ABIInternal environment: g in R14, cleared X15.
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	PXOR	X15, X15
+
+	// Reserve space for spill slots.
 	NOP	SP		// disable vet stack checking
-	ADJSP	$24
-	MOVL	DI, 0(SP)	// sig
-	MOVQ	SI, 8(SP)	// info
-	MOVQ	DX, 16(SP)	// ctx
-	CALL	·sigtrampgo(SB)
+	ADJSP   $24
+
+	// Call into the Go signal handler
+	MOVQ	DI, AX	// sig
+	MOVQ	SI, BX	// info
+	MOVQ	DX, CX	// ctx
+	CALL	·sigtrampgo<ABIInternal>(SB)
+
 	ADJSP	$-24
 
 	POP_REGS_HOST_TO_ABI0()
diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s
index 684c9ab..48b04ff 100644
--- a/src/runtime/sys_dragonfly_amd64.s
+++ b/src/runtime/sys_dragonfly_amd64.s
@@ -226,13 +226,21 @@
 	// Transition from C ABI to Go ABI.
 	PUSH_REGS_HOST_TO_ABI0()
 
-	// Call into the Go signal handler
+	// Set up ABIInternal environment: g in R14, cleared X15.
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	PXOR	X15, X15
+
+	// Reserve space for spill slots.
 	NOP	SP		// disable vet stack checking
-	ADJSP	$24
-	MOVQ	DI, 0(SP)	// sig
-	MOVQ	SI, 8(SP)	// info
-	MOVQ	DX, 16(SP)	// ctx
-	CALL	·sigtrampgo(SB)
+	ADJSP   $24
+
+	// Call into the Go signal handler
+	MOVQ	DI, AX	// sig
+	MOVQ	SI, BX	// info
+	MOVQ	DX, CX	// ctx
+	CALL	·sigtrampgo<ABIInternal>(SB)
+
 	ADJSP	$-24
 
 	POP_REGS_HOST_TO_ABI0()
diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s
index cc95da7..159f5c6 100644
--- a/src/runtime/sys_freebsd_amd64.s
+++ b/src/runtime/sys_freebsd_amd64.s
@@ -228,13 +228,21 @@
 	// Transition from C ABI to Go ABI.
 	PUSH_REGS_HOST_TO_ABI0()
 
-	// Call into the Go signal handler
+	// Set up ABIInternal environment: g in R14, cleared X15.
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	PXOR	X15, X15
+
+	// Reserve space for spill slots.
 	NOP	SP		// disable vet stack checking
-        ADJSP   $24
-	MOVQ	DI, 0(SP)	// sig
-	MOVQ	SI, 8(SP)	// info
-	MOVQ	DX, 16(SP)	// ctx
-	CALL	·sigtrampgo(SB)
+	ADJSP   $24
+
+	// Call into the Go signal handler
+	MOVQ	DI, AX	// sig
+	MOVQ	SI, BX	// info
+	MOVQ	DX, CX	// ctx
+	CALL	·sigtrampgo<ABIInternal>(SB)
+
 	ADJSP	$-24
 
         POP_REGS_HOST_TO_ABI0()
@@ -245,13 +253,21 @@
 	// Transition from C ABI to Go ABI.
 	PUSH_REGS_HOST_TO_ABI0()
 
-	// Call into the Go signal handler
+	// Set up ABIInternal environment: g in R14, cleared X15.
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	PXOR	X15, X15
+
+	// Reserve space for spill slots.
 	NOP	SP		// disable vet stack checking
-	ADJSP	$24
-	MOVL	DI, 0(SP)	// sig
-	MOVQ	SI, 8(SP)	// info
-	MOVQ	DX, 16(SP)	// ctx
-	CALL	·sigprofNonGo(SB)
+	ADJSP   $24
+
+	// Call into the Go signal handler
+	MOVQ	DI, AX	// sig
+	MOVQ	SI, BX	// info
+	MOVQ	DX, CX	// ctx
+	CALL	·sigprofNonGo<ABIInternal>(SB)
+
 	ADJSP	$-24
 
 	POP_REGS_HOST_TO_ABI0()
diff --git a/src/runtime/sys_freebsd_arm64.s b/src/runtime/sys_freebsd_arm64.s
index 7b05fb0..a4f12eb 100644
--- a/src/runtime/sys_freebsd_arm64.s
+++ b/src/runtime/sys_freebsd_arm64.s
@@ -295,10 +295,16 @@
 	BEQ	2(PC)
 	BL	runtime·load_g(SB)
 
+#ifdef GOEXPERIMENT_regabiargs
+	// Restore signum to R0.
+	MOVW	8(RSP), R0
+	// R1 and R2 already contain info and ctx, respectively.
+#else
 	MOVD	R1, 16(RSP)
 	MOVD	R2, 24(RSP)
-	MOVD	$runtime·sigtrampgo(SB), R0
-	BL	(R0)
+#endif
+	MOVD	$runtime·sigtrampgo<ABIInternal>(SB), R3
+	BL	(R3)
 
 	// Restore callee-save registers.
 	RESTORE_R19_TO_R28(8*4)
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index 4be0801..f2dfbbb7 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -343,13 +343,21 @@
 	// Transition from C ABI to Go ABI.
 	PUSH_REGS_HOST_TO_ABI0()
 
-	// Call into the Go signal handler
+	// Set up ABIInternal environment: g in R14, cleared X15.
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	PXOR	X15, X15
+
+	// Reserve space for spill slots.
 	NOP	SP		// disable vet stack checking
-        ADJSP   $24
-	MOVQ	DI, 0(SP)	// sig
-	MOVQ	SI, 8(SP)	// info
-	MOVQ	DX, 16(SP)	// ctx
-	CALL	·sigtrampgo(SB)
+	ADJSP   $24
+
+	// Call into the Go signal handler
+	MOVQ	DI, AX	// sig
+	MOVQ	SI, BX	// info
+	MOVQ	DX, CX	// ctx
+	CALL	·sigtrampgo<ABIInternal>(SB)
+
 	ADJSP	$-24
 
         POP_REGS_HOST_TO_ABI0()
@@ -360,13 +368,21 @@
 	// Transition from C ABI to Go ABI.
 	PUSH_REGS_HOST_TO_ABI0()
 
-	// Call into the Go signal handler
+	// Set up ABIInternal environment: g in R14, cleared X15.
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	PXOR	X15, X15
+
+	// Reserve space for spill slots.
 	NOP	SP		// disable vet stack checking
-	ADJSP	$24
-	MOVL	DI, 0(SP)	// sig
-	MOVQ	SI, 8(SP)	// info
-	MOVQ	DX, 16(SP)	// ctx
-	CALL	·sigprofNonGo(SB)
+	ADJSP   $24
+
+	// Call into the Go signal handler
+	MOVQ	DI, AX	// sig
+	MOVQ	SI, BX	// info
+	MOVQ	DX, CX	// ctx
+	CALL	·sigprofNonGo<ABIInternal>(SB)
+
 	ADJSP	$-24
 
 	POP_REGS_HOST_TO_ABI0()
diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s
index 36ac014..8e7cbf7 100644
--- a/src/runtime/sys_linux_arm64.s
+++ b/src/runtime/sys_linux_arm64.s
@@ -459,10 +459,16 @@
 	CBZ	R0, 2(PC)
 	BL	runtime·load_g(SB)
 
+#ifdef GOEXPERIMENT_regabiargs
+	// Restore signum to R0.
+	MOVW	8(RSP), R0
+	// R1 and R2 already contain info and ctx, respectively.
+#else
 	MOVD	R1, 16(RSP)
 	MOVD	R2, 24(RSP)
-	MOVD	$runtime·sigtrampgo(SB), R0
-	BL	(R0)
+#endif
+	MOVD	$runtime·sigtrampgo<ABIInternal>(SB), R3
+	BL	(R3)
 
 	// Restore callee-save registers.
 	RESTORE_R19_TO_R28(8*4)
@@ -476,10 +482,14 @@
 	SAVE_R19_TO_R28(8*4)
 	SAVE_F8_TO_F15(8*14)
 
+#ifdef GOEXPERIMENT_regabiargs
+	// R0, R1 and R2 already contain sig, info and ctx, respectively.
+#else
 	MOVW	R0, 8(RSP)	// sig
 	MOVD	R1, 16(RSP)	// info
 	MOVD	R2, 24(RSP)	// ctx
-	CALL	runtime·sigprofNonGo(SB)
+#endif
+	CALL	runtime·sigprofNonGo<ABIInternal>(SB)
 
 	// Restore callee-save registers.
 	RESTORE_R19_TO_R28(8*4)
diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s
index 41eddf3..ade1136 100644
--- a/src/runtime/sys_netbsd_amd64.s
+++ b/src/runtime/sys_netbsd_amd64.s
@@ -309,13 +309,21 @@
 	// Transition from C ABI to Go ABI.
 	PUSH_REGS_HOST_TO_ABI0()
 
-	// Call into the Go signal handler
+	// Set up ABIInternal environment: g in R14, cleared X15.
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	PXOR	X15, X15
+
+	// Reserve space for spill slots.
 	NOP	SP		// disable vet stack checking
-        ADJSP   $24
-	MOVQ	DI, 0(SP)	// sig
-	MOVQ	SI, 8(SP)	// info
-	MOVQ	DX, 16(SP)	// ctx
-	CALL	·sigtrampgo(SB)
+	ADJSP   $24
+
+	// Call into the Go signal handler
+	MOVQ	DI, AX	// sig
+	MOVQ	SI, BX	// info
+	MOVQ	DX, CX	// ctx
+	CALL	·sigtrampgo<ABIInternal>(SB)
+
 	ADJSP	$-24
 
         POP_REGS_HOST_TO_ABI0()
diff --git a/src/runtime/sys_netbsd_arm64.s b/src/runtime/sys_netbsd_arm64.s
index 32e6740..6bcd344 100644
--- a/src/runtime/sys_netbsd_arm64.s
+++ b/src/runtime/sys_netbsd_arm64.s
@@ -317,9 +317,15 @@
 	BEQ	2(PC)
 	BL	runtime·load_g(SB)
 
+#ifdef GOEXPERIMENT_regabiargs
+	// Restore signum to R0.
+	MOVW	8(RSP), R0
+	// R1 and R2 already contain info and ctx, respectively.
+#else
 	MOVD	R1, 16(RSP)
 	MOVD	R2, 24(RSP)
-	BL	runtime·sigtrampgo(SB)
+#endif
+	BL	runtime·sigtrampgo<ABIInternal>(SB)
 
 	// Restore callee-save registers.
 	RESTORE_R19_TO_R28(8*4)
diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s
index fc6d5dc..f71f5cc 100644
--- a/src/runtime/sys_openbsd_amd64.s
+++ b/src/runtime/sys_openbsd_amd64.s
@@ -62,16 +62,24 @@
 	// Transition from C ABI to Go ABI.
 	PUSH_REGS_HOST_TO_ABI0()
 
-	// Call into the Go signal handler
+	// Set up ABIInternal environment: g in R14, cleared X15.
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	PXOR	X15, X15
+
+	// Reserve space for spill slots.
 	NOP	SP		// disable vet stack checking
-        ADJSP   $24
-	MOVQ	DI, 0(SP)	// sig
-	MOVQ	SI, 8(SP)	// info
-	MOVQ	DX, 16(SP)	// ctx
-	CALL	·sigtrampgo(SB)
+	ADJSP   $24
+
+	// Call into the Go signal handler
+	MOVQ	DI, AX	// sig
+	MOVQ	SI, BX	// info
+	MOVQ	DX, CX	// ctx
+	CALL	·sigtrampgo<ABIInternal>(SB)
+
 	ADJSP	$-24
 
-        POP_REGS_HOST_TO_ABI0()
+	POP_REGS_HOST_TO_ABI0()
 	RET
 
 //
diff --git a/src/runtime/sys_openbsd_arm64.s b/src/runtime/sys_openbsd_arm64.s
index 7c1886e..4a3f2fc3 100644
--- a/src/runtime/sys_openbsd_arm64.s
+++ b/src/runtime/sys_openbsd_arm64.s
@@ -62,9 +62,15 @@
 	MOVW	R0, 8(RSP)		// signum
 	BL	runtime·load_g(SB)
 
+#ifdef GOEXPERIMENT_regabiargs
+	// Restore signum to R0.
+	MOVW	8(RSP), R0
+	// R1 and R2 already contain info and ctx, respectively.
+#else
 	MOVD	R1, 16(RSP)
 	MOVD	R2, 24(RSP)
-	BL	runtime·sigtrampgo(SB)
+#endif
+	BL	runtime·sigtrampgo<ABIInternal>(SB)
 
 	// Restore callee-save registers.
 	RESTORE_R19_TO_R28(8*4)
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 8f60de2..b50e1b2 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -461,12 +461,13 @@
 }
 
 // traceReader returns the trace reader that should be woken up, if any.
+// Callers should first check that trace.enabled or trace.shutdown is set.
 func traceReader() *g {
-	if trace.reader == 0 || (trace.fullHead == 0 && !trace.shutdown) {
+	if !traceReaderAvailable() {
 		return nil
 	}
 	lock(&trace.lock)
-	if trace.reader == 0 || (trace.fullHead == 0 && !trace.shutdown) {
+	if !traceReaderAvailable() {
 		unlock(&trace.lock)
 		return nil
 	}
@@ -476,6 +477,13 @@
 	return gp
 }
 
+// traceReaderAvailable returns true if the trace reader is not currently
+// scheduled and should be. Callers should first check that trace.enabled
+// or trace.shutdown is set.
+func traceReaderAvailable() bool {
+	return trace.reader != 0 && (trace.fullHead != 0 || trace.shutdown)
+}
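
traceReader is an instance of the double-checked locking pattern: a cheap unlocked test filters out the common nothing-to-do case, and the test is repeated under trace.lock before committing, because the state may change between the two checks. A generic sketch with hypothetical names:

	package main

	import "sync"

	// claim returns true only if available() still holds under the lock; the
	// unlocked fast path skips lock acquisition in the common no-op case.
	func claim(mu *sync.Mutex, available func() bool, take func()) bool {
		if !available() {
			return false
		}
		mu.Lock()
		defer mu.Unlock()
		if !available() { // re-check: state may have changed before Lock
			return false
		}
		take()
		return true
	}

	func main() {
		var mu sync.Mutex
		ready := true
		claim(&mu, func() bool { return ready }, func() { ready = false })
	}
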
+
 // traceProcFree frees trace buffer associated with pp.
 func traceProcFree(pp *p) {
 	buf := pp.tracebuf
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 23bce2b..9187d1f 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -470,7 +470,7 @@
 		}
 
 		waspanic = f.funcID == funcID_sigpanic
-		injectedCall := waspanic || f.funcID == funcID_asyncPreempt
+		injectedCall := waspanic || f.funcID == funcID_asyncPreempt || f.funcID == funcID_debugCallV2
 
 		// Do not unwind past the bottom of the stack.
 		if !flr.valid() {
diff --git a/src/sort/gen_sort_variants.go b/src/sort/gen_sort_variants.go
index 2b671ce..d738cac9 100644
--- a/src/sort/gen_sort_variants.go
+++ b/src/sort/gen_sort_variants.go
@@ -84,9 +84,6 @@
 			ExtraArg:   "",
 			DataType:   "[]E",
 			Funcs: template.FuncMap{
-				"GreaterOrEqual": func(name, i, j string) string {
-					return fmt.Sprintf("(%s[%s] >= %s[%s])", name, i, name, j)
-				},
 				"Less": func(name, i, j string) string {
 					return fmt.Sprintf("(%s[%s] < %s[%s])", name, i, name, j)
 				},
@@ -106,9 +103,6 @@
 			ExtraArg:   ", less",
 			DataType:   "[]E",
 			Funcs: template.FuncMap{
-				"GreaterOrEqual": func(name, i, j string) string {
-					return fmt.Sprintf("!less(%s[%s], %s[%s])", name, i, name, j)
-				},
 				"Less": func(name, i, j string) string {
 					return fmt.Sprintf("less(%s[%s], %s[%s])", name, i, name, j)
 				},
@@ -129,9 +123,6 @@
 			ExtraArg:   "",
 			DataType:   "Interface",
 			Funcs: template.FuncMap{
-				"GreaterOrEqual": func(name, i, j string) string {
-					return fmt.Sprintf("!%s.Less(%s, %s)", name, i, j)
-				},
 				"Less": func(name, i, j string) string {
 					return fmt.Sprintf("%s.Less(%s, %s)", name, i, j)
 				},
@@ -152,9 +143,6 @@
 			ExtraArg:   "",
 			DataType:   "lessSwap",
 			Funcs: template.FuncMap{
-				"GreaterOrEqual": func(name, i, j string) string {
-					return fmt.Sprintf("!%s.Less(%s, %s)", name, i, j)
-				},
 				"Less": func(name, i, j string) string {
 					return fmt.Sprintf("%s.Less(%s, %s)", name, i, j)
 				},
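
Dropping the GreaterOrEqual template helper across these variants rests on a simple identity: for a total order expressed through Less, data[i] >= data[j] holds exactly when !Less(data[i], data[j]) does. A quick check in plain Go (hypothetical values):

	package main

	import "fmt"

	func main() {
		less := func(a, b int) bool { return a < b }
		for _, p := range [][2]int{{1, 2}, {2, 2}, {3, 2}} {
			a, b := p[0], p[1]
			fmt.Println(a, b, a >= b, !less(a, b)) // the two columns always agree
		}
	}
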
@@ -222,7 +210,7 @@
 		if child+1 < hi && {{Less "data" "first+child" "first+child+1"}} {
 			child++
 		}
-		if {{GreaterOrEqual "data" "first+root" "first+child"}} {
+		if !{{Less "data" "first+root" "first+child"}} {
 			return
 		}
 		{{Swap "data" "first+root" "first+child"}}
@@ -300,7 +288,7 @@
 
 		// Probably the slice contains many duplicate elements, partition the slice into
 		// elements equal to and elements greater than the pivot.
-		if a > 0 && {{GreaterOrEqual "data" "a-1" "pivot"}} {
+		if a > 0 && !{{Less "data" "a-1" "pivot"}} {
 			mid := partitionEqual{{.FuncSuffix}}(data, a, b, pivot {{.ExtraArg}})
 			a = mid
 			continue
@@ -334,7 +322,7 @@
 	for i <= j && {{Less "data" "i" "a"}} {
 		i++
 	}
-	for i <= j && {{GreaterOrEqual "data" "j" "a"}} {
+	for i <= j && !{{Less "data" "j" "a"}} {
 		j--
 	}
 	if i > j {
@@ -349,7 +337,7 @@
 		for i <= j && {{Less "data" "i" "a"}} {
 			i++
 		}
-		for i <= j && {{GreaterOrEqual "data" "j" "a"}} {
+		for i <= j && !{{Less "data" "j" "a"}} {
 			j--
 		}
 		if i > j {
@@ -370,7 +358,7 @@
 	i, j := a+1, b-1 // i and j are inclusive of the elements remaining to be partitioned
 
 	for {
-		for i <= j && {{GreaterOrEqual "data" "a" "i"}} {
+		for i <= j && !{{Less "data" "a" "i"}} {
 			i++
 		}
 		for i <= j && {{Less "data" "a" "j"}} {
@@ -394,7 +382,7 @@
 	)
 	i := a + 1
 	for j := 0; j < maxSteps; j++ {
-		for i < b && {{GreaterOrEqual "data" "i" "i-1"}} {
+		for i < b && !{{Less "data" "i" "i-1"}} {
 			i++
 		}
 
@@ -411,7 +399,7 @@
 		// Shift the smaller one to the left.
 		if i-a >= 2 {
 			for j := i - 1; j >= 1; j-- {
-				if {{GreaterOrEqual "data" "j" "j-1"}} {
+				if !{{Less "data" "j" "j-1"}} {
 					break
 				}
 				{{Swap "data" "j" "j-1"}}
@@ -420,7 +408,7 @@
 		// Shift the greater one to the right.
 		if b-i >= 2 {
 			for j := i + 1; j < b; j++ {
-				if {{GreaterOrEqual "data" "j" "j-1"}} {
+				if !{{Less "data" "j" "j-1"}} {
 					break
 				}
 				{{Swap "data" "j" "j-1"}}
@@ -606,7 +594,7 @@
 		j := m
 		for i < j {
 			h := int(uint(i+j) >> 1)
-			if {{GreaterOrEqual "data" "m" "h"}} {
+			if !{{Less "data" "m" "h"}} {
 				i = h + 1
 			} else {
 				j = h
@@ -633,7 +621,7 @@
 
 	for start < r {
 		c := int(uint(start+r) >> 1)
-		if {{GreaterOrEqual "data" "p-c" "c"}} {
+		if !{{Less "data" "p-c" "c"}} {
 			start = c + 1
 		} else {
 			r = c
diff --git a/src/syscall/asm_linux_386.s b/src/syscall/asm_linux_386.s
index 1c69083..e86a859 100644
--- a/src/syscall/asm_linux_386.s
+++ b/src/syscall/asm_linux_386.s
@@ -13,103 +13,6 @@
 // instead of the glibc-specific "CALL 0x10(GS)".
 #define INVOKE_SYSCALL	INT	$0x80
 
-// func Syscall(trap uintptr, a1, a2, a3 uintptr) (r1, r2, err uintptr);
-// Trap # in AX, args in BX CX DX SI DI, return in AX
-TEXT ·Syscall(SB),NOSPLIT,$0-28
-	CALL	runtime·entersyscall(SB)
-	MOVL	trap+0(FP), AX	// syscall entry
-	MOVL	a1+4(FP), BX
-	MOVL	a2+8(FP), CX
-	MOVL	a3+12(FP), DX
-	MOVL	$0, SI
-	MOVL	$0, DI
-	INVOKE_SYSCALL
-	CMPL	AX, $0xfffff001
-	JLS	ok
-	MOVL	$-1, r1+16(FP)
-	MOVL	$0, r2+20(FP)
-	NEGL	AX
-	MOVL	AX, err+24(FP)
-	CALL	runtime·exitsyscall(SB)
-	RET
-ok:
-	MOVL	AX, r1+16(FP)
-	MOVL	DX, r2+20(FP)
-	MOVL	$0, err+24(FP)
-	CALL	runtime·exitsyscall(SB)
-	RET
-
-// func Syscall6(trap uintptr, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr);
-TEXT ·Syscall6(SB),NOSPLIT,$0-40
-	CALL	runtime·entersyscall(SB)
-	MOVL	trap+0(FP), AX	// syscall entry
-	MOVL	a1+4(FP), BX
-	MOVL	a2+8(FP), CX
-	MOVL	a3+12(FP), DX
-	MOVL	a4+16(FP), SI
-	MOVL	a5+20(FP), DI
-	MOVL	a6+24(FP), BP
-	INVOKE_SYSCALL
-	CMPL	AX, $0xfffff001
-	JLS	ok6
-	MOVL	$-1, r1+28(FP)
-	MOVL	$0, r2+32(FP)
-	NEGL	AX
-	MOVL	AX, err+36(FP)
-	CALL	runtime·exitsyscall(SB)
-	RET
-ok6:
-	MOVL	AX, r1+28(FP)
-	MOVL	DX, r2+32(FP)
-	MOVL	$0, err+36(FP)
-	CALL	runtime·exitsyscall(SB)
-	RET
-
-// func RawSyscall(trap uintptr, a1, a2, a3 uintptr) (r1, r2, err uintptr);
-TEXT ·RawSyscall(SB),NOSPLIT,$0-28
-	MOVL	trap+0(FP), AX	// syscall entry
-	MOVL	a1+4(FP), BX
-	MOVL	a2+8(FP), CX
-	MOVL	a3+12(FP), DX
-	MOVL	$0, SI
-	MOVL	$0, DI
-	INVOKE_SYSCALL
-	CMPL	AX, $0xfffff001
-	JLS	ok1
-	MOVL	$-1, r1+16(FP)
-	MOVL	$0, r2+20(FP)
-	NEGL	AX
-	MOVL	AX, err+24(FP)
-	RET
-ok1:
-	MOVL	AX, r1+16(FP)
-	MOVL	DX, r2+20(FP)
-	MOVL	$0, err+24(FP)
-	RET
-
-// func RawSyscall6(trap uintptr, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr);
-TEXT ·RawSyscall6(SB),NOSPLIT,$0-40
-	MOVL	trap+0(FP), AX	// syscall entry
-	MOVL	a1+4(FP), BX
-	MOVL	a2+8(FP), CX
-	MOVL	a3+12(FP), DX
-	MOVL	a4+16(FP), SI
-	MOVL	a5+20(FP), DI
-	MOVL	a6+24(FP), BP
-	INVOKE_SYSCALL
-	CMPL	AX, $0xfffff001
-	JLS	ok2
-	MOVL	$-1, r1+28(FP)
-	MOVL	$0, r2+32(FP)
-	NEGL	AX
-	MOVL	AX, err+36(FP)
-	RET
-ok2:
-	MOVL	AX, r1+28(FP)
-	MOVL	DX, r2+32(FP)
-	MOVL	$0, err+36(FP)
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
 	MOVL	trap+0(FP), AX	// syscall entry
diff --git a/src/syscall/asm_linux_amd64.s b/src/syscall/asm_linux_amd64.s
index 0b55a30..3206a45 100644
--- a/src/syscall/asm_linux_amd64.s
+++ b/src/syscall/asm_linux_amd64.s
@@ -11,102 +11,6 @@
 
 #define SYS_gettimeofday 96
 
-// func Syscall(trap int64, a1, a2, a3 uintptr) (r1, r2, err uintptr);
-// Trap # in AX, args in DI SI DX R10 R8 R9, return in AX DX
-// Note that this differs from "standard" ABI convention, which
-// would pass 4th arg in CX, not R10.
-
-TEXT ·Syscall(SB),NOSPLIT,$0-56
-	CALL	runtime·entersyscall<ABIInternal>(SB)
-	MOVQ	a1+8(FP), DI
-	MOVQ	a2+16(FP), SI
-	MOVQ	a3+24(FP), DX
-	MOVQ	trap+0(FP), AX	// syscall entry
-	SYSCALL
-	CMPQ	AX, $0xfffffffffffff001
-	JLS	ok
-	MOVQ	$-1, r1+32(FP)
-	MOVQ	$0, r2+40(FP)
-	NEGQ	AX
-	MOVQ	AX, err+48(FP)
-	CALL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-ok:
-	MOVQ	AX, r1+32(FP)
-	MOVQ	DX, r2+40(FP)
-	MOVQ	$0, err+48(FP)
-	CALL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-
-// func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr)
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	CALL	runtime·entersyscall<ABIInternal>(SB)
-	MOVQ	a1+8(FP), DI
-	MOVQ	a2+16(FP), SI
-	MOVQ	a3+24(FP), DX
-	MOVQ	a4+32(FP), R10
-	MOVQ	a5+40(FP), R8
-	MOVQ	a6+48(FP), R9
-	MOVQ	trap+0(FP), AX	// syscall entry
-	SYSCALL
-	CMPQ	AX, $0xfffffffffffff001
-	JLS	ok6
-	MOVQ	$-1, r1+56(FP)
-	MOVQ	$0, r2+64(FP)
-	NEGQ	AX
-	MOVQ	AX, err+72(FP)
-	CALL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-ok6:
-	MOVQ	AX, r1+56(FP)
-	MOVQ	DX, r2+64(FP)
-	MOVQ	$0, err+72(FP)
-	CALL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-
-// func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2, err uintptr)
-TEXT ·RawSyscall(SB),NOSPLIT,$0-56
-	MOVQ	a1+8(FP), DI
-	MOVQ	a2+16(FP), SI
-	MOVQ	a3+24(FP), DX
-	MOVQ	trap+0(FP), AX	// syscall entry
-	SYSCALL
-	CMPQ	AX, $0xfffffffffffff001
-	JLS	ok1
-	MOVQ	$-1, r1+32(FP)
-	MOVQ	$0, r2+40(FP)
-	NEGQ	AX
-	MOVQ	AX, err+48(FP)
-	RET
-ok1:
-	MOVQ	AX, r1+32(FP)
-	MOVQ	DX, r2+40(FP)
-	MOVQ	$0, err+48(FP)
-	RET
-
-// func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr)
-TEXT ·RawSyscall6(SB),NOSPLIT,$0-80
-	MOVQ	a1+8(FP), DI
-	MOVQ	a2+16(FP), SI
-	MOVQ	a3+24(FP), DX
-	MOVQ	a4+32(FP), R10
-	MOVQ	a5+40(FP), R8
-	MOVQ	a6+48(FP), R9
-	MOVQ	trap+0(FP), AX	// syscall entry
-	SYSCALL
-	CMPQ	AX, $0xfffffffffffff001
-	JLS	ok2
-	MOVQ	$-1, r1+56(FP)
-	MOVQ	$0, r2+64(FP)
-	NEGQ	AX
-	MOVQ	AX, err+72(FP)
-	RET
-ok2:
-	MOVQ	AX, r1+56(FP)
-	MOVQ	DX, r2+64(FP)
-	MOVQ	$0, err+72(FP)
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
 	MOVQ	a1+8(FP), DI
diff --git a/src/syscall/asm_linux_arm.s b/src/syscall/asm_linux_arm.s
index 6bb4df8..3252220 100644
--- a/src/syscall/asm_linux_arm.s
+++ b/src/syscall/asm_linux_arm.s
@@ -9,95 +9,6 @@
 // System calls for arm, Linux
 //
 
-// func Syscall(syscall uintptr, a1, a2, a3 uintptr) (r1, r2, err uintptr);
-TEXT ·Syscall(SB),NOSPLIT,$0-28
-	BL	runtime·entersyscall(SB)
-	MOVW	trap+0(FP), R7
-	MOVW	a1+4(FP), R0
-	MOVW	a2+8(FP), R1
-	MOVW	a3+12(FP), R2
-	MOVW	$0, R3
-	MOVW	$0, R4
-	MOVW	$0, R5
-	SWI	$0
-	MOVW	$0xfffff001, R1
-	CMP	R1, R0
-	BLS	ok
-	MOVW	$-1, R1
-	MOVW	R1, r1+16(FP)
-	MOVW	$0, R2
-	MOVW	R2, r2+20(FP)
-	RSB	$0, R0, R0
-	MOVW	R0, err+24(FP)
-	BL	runtime·exitsyscall(SB)
-	RET
-ok:
-	MOVW	R0, r1+16(FP)
-	MOVW	$0, R0
-	MOVW	R0, r2+20(FP)
-	MOVW	R0, err+24(FP)
-	BL	runtime·exitsyscall(SB)
-	RET
-
-// func Syscall6(trap uintptr, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr);
-// Actually Syscall5 but the rest of the code expects it to be named Syscall6.
-TEXT ·Syscall6(SB),NOSPLIT,$0-40
-	BL	runtime·entersyscall(SB)
-	MOVW	trap+0(FP), R7	// syscall entry
-	MOVW	a1+4(FP), R0
-	MOVW	a2+8(FP), R1
-	MOVW	a3+12(FP), R2
-	MOVW	a4+16(FP), R3
-	MOVW	a5+20(FP), R4
-	MOVW	a6+24(FP), R5
-	SWI	$0
-	MOVW	$0xfffff001, R6
-	CMP	R6, R0
-	BLS	ok6
-	MOVW	$-1, R1
-	MOVW	R1, r1+28(FP)
-	MOVW	$0, R2
-	MOVW	R2, r2+32(FP)
-	RSB	$0, R0, R0
-	MOVW	R0, err+36(FP)
-	BL	runtime·exitsyscall(SB)
-	RET
-ok6:
-	MOVW	R0, r1+28(FP)
-	MOVW	R1, r2+32(FP)
-	MOVW	$0, R0
-	MOVW	R0, err+36(FP)
-	BL	runtime·exitsyscall(SB)
-	RET
-
-// func RawSyscall6(trap uintptr, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr);
-// Actually RawSyscall5 but the rest of the code expects it to be named RawSyscall6.
-TEXT	·RawSyscall6(SB),NOSPLIT,$0-40
-	MOVW	trap+0(FP), R7	// syscall entry
-	MOVW	a1+4(FP), R0
-	MOVW	a2+8(FP), R1
-	MOVW	a3+12(FP), R2
-	MOVW	a4+16(FP), R3
-	MOVW	a5+20(FP), R4
-	MOVW	a6+24(FP), R5
-	SWI	$0
-	MOVW	$0xfffff001, R6
-	CMP	R6, R0
-	BLS	ok2
-	MOVW	$-1, R1
-	MOVW	R1, r1+28(FP)
-	MOVW	$0, R2
-	MOVW	R2, r2+32(FP)
-	RSB	$0, R0, R0
-	MOVW	R0, err+36(FP)
-	RET
-ok2:
-	MOVW	R0, r1+28(FP)
-	MOVW	R1, r2+32(FP)
-	MOVW	$0, R0
-	MOVW	R0, err+36(FP)
-	RET
-
 #define SYS__LLSEEK 140  /* from zsysnum_linux_arm.go */
 // func seek(fd int, offset int64, whence int) (newoffset int64, errno int)
 // Implemented in assembly to avoid allocation when
@@ -130,30 +41,6 @@
 	BL	runtime·exitsyscall(SB)
 	RET
 
-// func RawSyscall(trap uintptr, a1, a2, a3 uintptr) (r1, r2, err uintptr);
-TEXT ·RawSyscall(SB),NOSPLIT,$0-28
-	MOVW	trap+0(FP), R7	// syscall entry
-	MOVW	a1+4(FP), R0
-	MOVW	a2+8(FP), R1
-	MOVW	a3+12(FP), R2
-	SWI	$0
-	MOVW	$0xfffff001, R1
-	CMP	R1, R0
-	BLS	ok1
-	MOVW	$-1, R1
-	MOVW	R1, r1+16(FP)
-	MOVW	$0, R2
-	MOVW	R2, r2+20(FP)
-	RSB	$0, R0, R0
-	MOVW	R0, err+24(FP)
-	RET
-ok1:
-	MOVW	R0, r1+16(FP)
-	MOVW	$0, R0
-	MOVW	R0, r2+20(FP)
-	MOVW	R0, err+24(FP)
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
 	MOVW	trap+0(FP), R7	// syscall entry
diff --git a/src/syscall/asm_linux_arm64.s b/src/syscall/asm_linux_arm64.s
index 6c50fa9..be78ac8 100644
--- a/src/syscall/asm_linux_arm64.s
+++ b/src/syscall/asm_linux_arm64.s
@@ -4,105 +4,6 @@
 
 #include "textflag.h"
 
-// func Syscall(trap int64, a1, a2, a3 int64) (r1, r2, err int64);
-TEXT ·Syscall(SB),NOSPLIT,$0-56
-	BL	runtime·entersyscall<ABIInternal>(SB)
-	MOVD	a1+8(FP), R0
-	MOVD	a2+16(FP), R1
-	MOVD	a3+24(FP), R2
-	MOVD	$0, R3
-	MOVD	$0, R4
-	MOVD	$0, R5
-	MOVD	trap+0(FP), R8	// syscall entry
-	SVC
-	CMN	$4095, R0
-	BCC	ok
-	MOVD	$-1, R4
-	MOVD	R4, r1+32(FP)	// r1
-	MOVD	ZR, r2+40(FP)	// r2
-	NEG	R0, R0
-	MOVD	R0, err+48(FP)	// errno
-	BL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-ok:
-	MOVD	R0, r1+32(FP)	// r1
-	MOVD	R1, r2+40(FP)	// r2
-	MOVD	ZR, err+48(FP)	// errno
-	BL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	BL	runtime·entersyscall<ABIInternal>(SB)
-	MOVD	a1+8(FP), R0
-	MOVD	a2+16(FP), R1
-	MOVD	a3+24(FP), R2
-	MOVD	a4+32(FP), R3
-	MOVD	a5+40(FP), R4
-	MOVD	a6+48(FP), R5
-	MOVD	trap+0(FP), R8	// syscall entry
-	SVC
-	CMN	$4095, R0
-	BCC	ok
-	MOVD	$-1, R4
-	MOVD	R4, r1+56(FP)	// r1
-	MOVD	ZR, r2+64(FP)	// r2
-	NEG	R0, R0
-	MOVD	R0, err+72(FP)	// errno
-	BL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-ok:
-	MOVD	R0, r1+56(FP)	// r1
-	MOVD	R1, r2+64(FP)	// r2
-	MOVD	ZR, err+72(FP)	// errno
-	BL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-
-TEXT ·RawSyscall(SB),NOSPLIT,$0-56
-	MOVD	a1+8(FP), R0
-	MOVD	a2+16(FP), R1
-	MOVD	a3+24(FP), R2
-	MOVD	$0, R3
-	MOVD	$0, R4
-	MOVD	$0, R5
-	MOVD	trap+0(FP), R8	// syscall entry
-	SVC
-	CMN	$4095, R0
-	BCC	ok
-	MOVD	$-1, R4
-	MOVD	R4, r1+32(FP)	// r1
-	MOVD	ZR, r2+40(FP)	// r2
-	NEG	R0, R0
-	MOVD	R0, err+48(FP)	// errno
-	RET
-ok:
-	MOVD	R0, r1+32(FP)	// r1
-	MOVD	R1, r2+40(FP)	// r2
-	MOVD	ZR, err+48(FP)	// errno
-	RET
-
-TEXT ·RawSyscall6(SB),NOSPLIT,$0-80
-	MOVD	a1+8(FP), R0
-	MOVD	a2+16(FP), R1
-	MOVD	a3+24(FP), R2
-	MOVD	a4+32(FP), R3
-	MOVD	a5+40(FP), R4
-	MOVD	a6+48(FP), R5
-	MOVD	trap+0(FP), R8	// syscall entry
-	SVC
-	CMN	$4095, R0
-	BCC	ok
-	MOVD	$-1, R4
-	MOVD	R4, r1+56(FP)	// r1
-	MOVD	ZR, r2+64(FP)	// r2
-	NEG	R0, R0
-	MOVD	R0, err+72(FP)	// errno
-	RET
-ok:
-	MOVD	R0, r1+56(FP)	// r1
-	MOVD	R1, r2+64(FP)	// r2
-	MOVD	ZR, err+72(FP)	// errno
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-32
 	MOVD	a1+8(FP), R0
diff --git a/src/syscall/asm_linux_mips64x.s b/src/syscall/asm_linux_mips64x.s
index a75d0f7..fadf193 100644
--- a/src/syscall/asm_linux_mips64x.s
+++ b/src/syscall/asm_linux_mips64x.s
@@ -10,97 +10,6 @@
 // System calls for mips64, Linux
 //
 
-// func Syscall(trap int64, a1, a2, a3 int64) (r1, r2, err int64);
-TEXT ·Syscall(SB),NOSPLIT,$0-56
-	JAL	runtime·entersyscall(SB)
-	MOVV	a1+8(FP), R4
-	MOVV	a2+16(FP), R5
-	MOVV	a3+24(FP), R6
-	MOVV	R0, R7
-	MOVV	R0, R8
-	MOVV	R0, R9
-	MOVV	trap+0(FP), R2	// syscall entry
-	SYSCALL
-	BEQ	R7, ok
-	MOVV	$-1, R1
-	MOVV	R1, r1+32(FP)	// r1
-	MOVV	R0, r2+40(FP)	// r2
-	MOVV	R2, err+48(FP)	// errno
-	JAL	runtime·exitsyscall(SB)
-	RET
-ok:
-	MOVV	R2, r1+32(FP)	// r1
-	MOVV	R3, r2+40(FP)	// r2
-	MOVV	R0, err+48(FP)	// errno
-	JAL	runtime·exitsyscall(SB)
-	RET
-
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	JAL	runtime·entersyscall(SB)
-	MOVV	a1+8(FP), R4
-	MOVV	a2+16(FP), R5
-	MOVV	a3+24(FP), R6
-	MOVV	a4+32(FP), R7
-	MOVV	a5+40(FP), R8
-	MOVV	a6+48(FP), R9
-	MOVV	trap+0(FP), R2	// syscall entry
-	SYSCALL
-	BEQ	R7, ok6
-	MOVV	$-1, R1
-	MOVV	R1, r1+56(FP)	// r1
-	MOVV	R0, r2+64(FP)	// r2
-	MOVV	R2, err+72(FP)	// errno
-	JAL	runtime·exitsyscall(SB)
-	RET
-ok6:
-	MOVV	R2, r1+56(FP)	// r1
-	MOVV	R3, r2+64(FP)	// r2
-	MOVV	R0, err+72(FP)	// errno
-	JAL	runtime·exitsyscall(SB)
-	RET
-
-TEXT ·RawSyscall(SB),NOSPLIT,$0-56
-	MOVV	a1+8(FP), R4
-	MOVV	a2+16(FP), R5
-	MOVV	a3+24(FP), R6
-	MOVV	R0, R7
-	MOVV	R0, R8
-	MOVV	R0, R9
-	MOVV	trap+0(FP), R2	// syscall entry
-	SYSCALL
-	BEQ	R7, ok1
-	MOVV	$-1, R1
-	MOVV	R1, r1+32(FP)	// r1
-	MOVV	R0, r2+40(FP)	// r2
-	MOVV	R2, err+48(FP)	// errno
-	RET
-ok1:
-	MOVV	R2, r1+32(FP)	// r1
-	MOVV	R3, r2+40(FP)	// r2
-	MOVV	R0, err+48(FP)	// errno
-	RET
-
-TEXT ·RawSyscall6(SB),NOSPLIT,$0-80
-	MOVV	a1+8(FP), R4
-	MOVV	a2+16(FP), R5
-	MOVV	a3+24(FP), R6
-	MOVV	a4+32(FP), R7
-	MOVV	a5+40(FP), R8
-	MOVV	a6+48(FP), R9
-	MOVV	trap+0(FP), R2	// syscall entry
-	SYSCALL
-	BEQ	R7, ok2
-	MOVV	$-1, R1
-	MOVV	R1, r1+56(FP)	// r1
-	MOVV	R0, r2+64(FP)	// r2
-	MOVV	R2, err+72(FP)	// errno
-	RET
-ok2:
-	MOVV	R2, r1+56(FP)	// r1
-	MOVV	R3, r2+64(FP)	// r2
-	MOVV	R0, err+72(FP)	// errno
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
 	MOVV	a1+8(FP), R4
diff --git a/src/syscall/asm_linux_mipsx.s b/src/syscall/asm_linux_mipsx.s
index 04f90f6..b8cae96 100644
--- a/src/syscall/asm_linux_mipsx.s
+++ b/src/syscall/asm_linux_mipsx.s
@@ -11,59 +11,6 @@
 // System calls for mips, Linux
 //
 
-// func Syscall(trap uintptr, a1, a2, a3 uintptr) (r1, r2, err uintptr);
-TEXT ·Syscall(SB),NOSPLIT,$0-28
-	JAL	runtime·entersyscall(SB)
-	MOVW	a1+4(FP), R4
-	MOVW	a2+8(FP), R5
-	MOVW	a3+12(FP), R6
-	MOVW	R0, R7
-	MOVW	trap+0(FP), R2	// syscall entry
-	SYSCALL
-	BEQ	R7, ok
-	MOVW	$-1, R1
-	MOVW	R1, r1+16(FP)	// r1
-	MOVW	R0, r2+20(FP)	// r2
-	MOVW	R2, err+24(FP)	// errno
-	JAL	runtime·exitsyscall(SB)
-	RET
-ok:
-	MOVW	R2, r1+16(FP)	// r1
-	MOVW	R3, r2+20(FP)	// r2
-	MOVW	R0, err+24(FP)	// errno
-	JAL	runtime·exitsyscall(SB)
-	RET
-
-// func Syscall6(trap trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr);
-// 5th and 6th arg go at sp+16, sp+20.
-// Note that frame size of 20 means that 24 bytes gets reserved on stack.
-TEXT ·Syscall6(SB),NOSPLIT,$20-40
-	NO_LOCAL_POINTERS
-	JAL	runtime·entersyscall(SB)
-	MOVW	a1+4(FP), R4
-	MOVW	a2+8(FP), R5
-	MOVW	a3+12(FP), R6
-	MOVW	a4+16(FP), R7
-	MOVW	a5+20(FP), R8
-	MOVW	a6+24(FP), R9
-	MOVW	R8, 16(R29)
-	MOVW	R9, 20(R29)
-	MOVW	trap+0(FP), R2	// syscall entry
-	SYSCALL
-	BEQ	R7, ok6
-	MOVW	$-1, R1
-	MOVW	R1, r1+28(FP)	// r1
-	MOVW	R0, r2+32(FP)	// r2
-	MOVW	R2, err+36(FP)	// errno
-	JAL	runtime·exitsyscall(SB)
-	RET
-ok6:
-	MOVW	R2, r1+28(FP)	// r1
-	MOVW	R3, r2+32(FP)	// r2
-	MOVW	R0, err+36(FP)	// errno
-	JAL	runtime·exitsyscall(SB)
-	RET
-
 // func Syscall9(trap trap, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2, err uintptr);
 // Actually Syscall8 but the rest of the code expects it to be named Syscall9.
 TEXT ·Syscall9(SB),NOSPLIT,$28-52
@@ -97,47 +44,6 @@
 	JAL	runtime·exitsyscall(SB)
 	RET
 
-TEXT ·RawSyscall(SB),NOSPLIT,$24-28
-	MOVW	a1+4(FP), R4
-	MOVW	a2+8(FP), R5
-	MOVW	a3+12(FP), R6
-	MOVW	trap+0(FP), R2	// syscall entry
-	SYSCALL
-	BEQ	R7, ok1
-	MOVW	$-1, R1
-	MOVW	R1, r1+16(FP)	// r1
-	MOVW	R0, r2+20(FP)	// r2
-	MOVW	R2, err+24(FP)	// errno
-	RET
-ok1:
-	MOVW	R2, r1+16(FP)	// r1
-	MOVW	R3, r2+20(FP)	// r2
-	MOVW	R0, err+24(FP)	// errno
-	RET
-
-TEXT ·RawSyscall6(SB),NOSPLIT,$20-40
-	MOVW	a1+4(FP), R4
-	MOVW	a2+8(FP), R5
-	MOVW	a3+12(FP), R6
-	MOVW	a4+16(FP), R7
-	MOVW	a5+20(FP), R8
-	MOVW	a6+24(FP), R9
-	MOVW	R8, 16(R29)
-	MOVW	R9, 20(R29)
-	MOVW	trap+0(FP), R2	// syscall entry
-	SYSCALL
-	BEQ	R7, ok2
-	MOVW	$-1, R1
-	MOVW	R1, r1+28(FP)	// r1
-	MOVW	R0, r2+32(FP)	// r2
-	MOVW	R2, err+36(FP)	// errno
-	RET
-ok2:
-	MOVW	R2, r1+28(FP)	// r1
-	MOVW	R3, r2+32(FP)	// r2
-	MOVW	R0, err+36(FP)	// errno
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
 	MOVW	a1+4(FP), R4
diff --git a/src/syscall/asm_linux_ppc64x.s b/src/syscall/asm_linux_ppc64x.s
index bf701e5..89cc1c2 100644
--- a/src/syscall/asm_linux_ppc64x.s
+++ b/src/syscall/asm_linux_ppc64x.s
@@ -10,97 +10,6 @@
 // System calls for ppc64, Linux
 //
 
-// func Syscall(trap int64, a1, a2, a3 int64) (r1, r2, err int64);
-TEXT ·Syscall(SB),NOSPLIT,$0-56
-	BL	runtime·entersyscall<ABIInternal>(SB)
-	MOVD	a1+8(FP), R3
-	MOVD	a2+16(FP), R4
-	MOVD	a3+24(FP), R5
-	MOVD	R0, R6
-	MOVD	R0, R7
-	MOVD	R0, R8
-	MOVD	trap+0(FP), R9	// syscall entry
-	SYSCALL R9
-	BVC	ok
-	MOVD	$-1, R4
-	MOVD	R4, r1+32(FP)	// r1
-	MOVD	R0, r2+40(FP)	// r2
-	MOVD	R3, err+48(FP)	// errno
-	BL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-ok:
-	MOVD	R3, r1+32(FP)	// r1
-	MOVD	R0, r2+40(FP)	// r2
-	MOVD	R0, err+48(FP)	// errno
-	BL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	BL	runtime·entersyscall<ABIInternal>(SB)
-	MOVD	a1+8(FP), R3
-	MOVD	a2+16(FP), R4
-	MOVD	a3+24(FP), R5
-	MOVD	a4+32(FP), R6
-	MOVD	a5+40(FP), R7
-	MOVD	a6+48(FP), R8
-	MOVD	trap+0(FP), R9	// syscall entry
-	SYSCALL R9
-	BVC	ok6
-	MOVD	$-1, R4
-	MOVD	R4, r1+56(FP)	// r1
-	MOVD	R0, r2+64(FP)	// r2
-	MOVD	R3, err+72(FP)	// errno
-	BL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-ok6:
-	MOVD	R3, r1+56(FP)	// r1
-	MOVD	R0, r2+64(FP)	// r2
-	MOVD	R0, err+72(FP)	// errno
-	BL	runtime·exitsyscall<ABIInternal>(SB)
-	RET
-
-TEXT ·RawSyscall(SB),NOSPLIT,$0-56
-	MOVD	a1+8(FP), R3
-	MOVD	a2+16(FP), R4
-	MOVD	a3+24(FP), R5
-	MOVD	R0, R6
-	MOVD	R0, R7
-	MOVD	R0, R8
-	MOVD	trap+0(FP), R9	// syscall entry
-	SYSCALL R9
-	BVC	ok1
-	MOVD	$-1, R4
-	MOVD	R4, r1+32(FP)	// r1
-	MOVD	R0, r2+40(FP)	// r2
-	MOVD	R3, err+48(FP)	// errno
-	RET
-ok1:
-	MOVD	R3, r1+32(FP)	// r1
-	MOVD	R0, r2+40(FP)	// r2
-	MOVD	R0, err+48(FP)	// errno
-	RET
-
-TEXT ·RawSyscall6(SB),NOSPLIT,$0-80
-	MOVD	a1+8(FP), R3
-	MOVD	a2+16(FP), R4
-	MOVD	a3+24(FP), R5
-	MOVD	a4+32(FP), R6
-	MOVD	a5+40(FP), R7
-	MOVD	a6+48(FP), R8
-	MOVD	trap+0(FP), R9	// syscall entry
-	SYSCALL R9
-	BVC	ok2
-	MOVD	$-1, R4
-	MOVD	R4, r1+56(FP)	// r1
-	MOVD	R0, r2+64(FP)	// r2
-	MOVD	R3, err+72(FP)	// errno
-	RET
-ok2:
-	MOVD	R3, r1+56(FP)	// r1
-	MOVD	R0, r2+64(FP)	// r2
-	MOVD	R0, err+72(FP)	// errno
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
 	MOVD	a1+8(FP), R3
diff --git a/src/syscall/asm_linux_riscv64.s b/src/syscall/asm_linux_riscv64.s
index f172dd3..0fc1f73 100644
--- a/src/syscall/asm_linux_riscv64.s
+++ b/src/syscall/asm_linux_riscv64.s
@@ -8,102 +8,6 @@
 // System calls for riscv64, Linux
 //
 
-// func Syscall(trap int64, a1, a2, a3 int64) (r1, r2, err int64)
-TEXT ·Syscall(SB),NOSPLIT,$0-56
-	CALL	runtime·entersyscall(SB)
-	MOV	a1+8(FP), A0
-	MOV	a2+16(FP), A1
-	MOV	a3+24(FP), A2
-	MOV	trap+0(FP), A7	// syscall entry
-	ECALL
-	MOV	$-4096, T0
-	BLTU	T0, A0, err
-	MOV	A0, r1+32(FP)	// r1
-	MOV	A1, r2+40(FP)	// r2
-	MOV	ZERO, err+48(FP)	// errno
-	CALL	runtime·exitsyscall(SB)
-	RET
-err:
-	MOV	$-1, T0
-	MOV	T0, r1+32(FP)	// r1
-	MOV	ZERO, r2+40(FP)	// r2
-	SUB	A0, ZERO, A0
-	MOV	A0, err+48(FP)	// errno
-	CALL	runtime·exitsyscall(SB)
-	RET
-
-// func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr)
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	CALL	runtime·entersyscall(SB)
-	MOV	a1+8(FP), A0
-	MOV	a2+16(FP), A1
-	MOV	a3+24(FP), A2
-	MOV	a4+32(FP), A3
-	MOV	a5+40(FP), A4
-	MOV	a6+48(FP), A5
-	MOV	trap+0(FP), A7	// syscall entry
-	ECALL
-	MOV	$-4096, T0
-	BLTU	T0, A0, err
-	MOV	A0, r1+56(FP)	// r1
-	MOV	A1, r2+64(FP)	// r2
-	MOV	ZERO, err+72(FP)	// errno
-	CALL	runtime·exitsyscall(SB)
-	RET
-err:
-	MOV	$-1, T0
-	MOV	T0, r1+56(FP)	// r1
-	MOV	ZERO, r2+64(FP)	// r2
-	SUB	A0, ZERO, A0
-	MOV	A0, err+72(FP)	// errno
-	CALL	runtime·exitsyscall(SB)
-	RET
-
-// func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2, err uintptr)
-TEXT ·RawSyscall(SB),NOSPLIT,$0-56
-	MOV	a1+8(FP), A0
-	MOV	a2+16(FP), A1
-	MOV	a3+24(FP), A2
-	MOV	trap+0(FP), A7	// syscall entry
-	ECALL
-	MOV	$-4096, T0
-	BLTU	T0, A0, err
-	MOV	A0, r1+32(FP)	// r1
-	MOV	A1, r2+40(FP)	// r2
-	MOV	ZERO, err+48(FP)	// errno
-	RET
-err:
-	MOV	$-1, T0
-	MOV	T0, r1+32(FP)	// r1
-	MOV	ZERO, r2+40(FP)	// r2
-	SUB	A0, ZERO, A0
-	MOV	A0, err+48(FP)	// errno
-	RET
-
-// func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr)
-TEXT ·RawSyscall6(SB),NOSPLIT,$0-80
-	MOV	a1+8(FP), A0
-	MOV	a2+16(FP), A1
-	MOV	a3+24(FP), A2
-	MOV	a4+32(FP), A3
-	MOV	a5+40(FP), A4
-	MOV	a6+48(FP), A5
-	MOV	trap+0(FP), A7	// syscall entry
-	ECALL
-	MOV	$-4096, T0
-	BLTU	T0, A0, err
-	MOV	A0, r1+56(FP)	// r1
-	MOV	A1, r2+64(FP)	// r2
-	MOV	ZERO, err+72(FP)	// errno
-	RET
-err:
-	MOV	$-1, T0
-	MOV	T0, r1+56(FP)	// r1
-	MOV	ZERO, r2+64(FP)	// r2
-	SUB	A0, ZERO, A0
-	MOV	A0, err+72(FP)	// errno
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
 	MOV	a1+8(FP), A0
diff --git a/src/syscall/asm_linux_s390x.s b/src/syscall/asm_linux_s390x.s
index 86a5c51..c3631c1 100644
--- a/src/syscall/asm_linux_s390x.s
+++ b/src/syscall/asm_linux_s390x.s
@@ -8,104 +8,6 @@
 // System calls for s390x, Linux
 //
 
-// func Syscall(trap int64, a1, a2, a3 int64) (r1, r2, err int64)
-TEXT ·Syscall(SB),NOSPLIT,$0-56
-	BL	runtime·entersyscall(SB)
-	MOVD	a1+8(FP), R2
-	MOVD	a2+16(FP), R3
-	MOVD	a3+24(FP), R4
-	MOVD	$0, R5
-	MOVD	$0, R6
-	MOVD	$0, R7
-	MOVD	trap+0(FP), R1	// syscall entry
-	SYSCALL
-	MOVD	$0xfffffffffffff001, R8
-	CMPUBLT	R2, R8, ok
-	MOVD	$-1, r1+32(FP)
-	MOVD	$0, r2+40(FP)
-	NEG	R2, R2
-	MOVD	R2, err+48(FP)	// errno
-	BL	runtime·exitsyscall(SB)
-	RET
-ok:
-	MOVD	R2, r1+32(FP)
-	MOVD	R3, r2+40(FP)
-	MOVD	$0, err+48(FP)	// errno
-	BL	runtime·exitsyscall(SB)
-	RET
-
-// func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr)
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	BL	runtime·entersyscall(SB)
-	MOVD	a1+8(FP), R2
-	MOVD	a2+16(FP), R3
-	MOVD	a3+24(FP), R4
-	MOVD	a4+32(FP), R5
-	MOVD	a5+40(FP), R6
-	MOVD	a6+48(FP), R7
-	MOVD	trap+0(FP), R1	// syscall entry
-	SYSCALL
-	MOVD	$0xfffffffffffff001, R8
-	CMPUBLT	R2, R8, ok6
-	MOVD	$-1, r1+56(FP)
-	MOVD	$0, r2+64(FP)
-	NEG	R2, R2
-	MOVD	R2, err+72(FP)	// errno
-	BL	runtime·exitsyscall(SB)
-	RET
-ok6:
-	MOVD	R2, r1+56(FP)
-	MOVD	R3, r2+64(FP)
-	MOVD	$0, err+72(FP)	// errno
-	BL	runtime·exitsyscall(SB)
-	RET
-
-// func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2, err uintptr)
-TEXT ·RawSyscall(SB),NOSPLIT,$0-56
-	MOVD	a1+8(FP), R2
-	MOVD	a2+16(FP), R3
-	MOVD	a3+24(FP), R4
-	MOVD	$0, R5
-	MOVD	$0, R6
-	MOVD	$0, R7
-	MOVD	trap+0(FP), R1	// syscall entry
-	SYSCALL
-	MOVD	$0xfffffffffffff001, R8
-	CMPUBLT	R2, R8, ok1
-	MOVD	$-1, r1+32(FP)
-	MOVD	$0, r2+40(FP)
-	NEG	R2, R2
-	MOVD	R2, err+48(FP)	// errno
-	RET
-ok1:
-	MOVD	R2, r1+32(FP)
-	MOVD	R3, r2+40(FP)
-	MOVD	$0, err+48(FP)	// errno
-	RET
-
-// func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr)
-TEXT ·RawSyscall6(SB),NOSPLIT,$0-80
-	MOVD	a1+8(FP), R2
-	MOVD	a2+16(FP), R3
-	MOVD	a3+24(FP), R4
-	MOVD	a4+32(FP), R5
-	MOVD	a5+40(FP), R6
-	MOVD	a6+48(FP), R7
-	MOVD	trap+0(FP), R1	// syscall entry
-	SYSCALL
-	MOVD	$0xfffffffffffff001, R8
-	CMPUBLT	R2, R8, ok2
-	MOVD	$-1, r1+56(FP)
-	MOVD	$0, r2+64(FP)
-	NEG	R2, R2
-	MOVD	R2, err+72(FP)	// errno
-	RET
-ok2:
-	MOVD	R2, r1+56(FP)
-	MOVD	R3, r2+64(FP)
-	MOVD	$0, err+72(FP)	// errno
-	RET
-
 // func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
 TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
 	MOVD	$0, R2
diff --git a/src/syscall/syscall_aix.go b/src/syscall/syscall_aix.go
index a2f8c78..6934241 100644
--- a/src/syscall/syscall_aix.go
+++ b/src/syscall/syscall_aix.go
@@ -15,6 +15,11 @@
 	"unsafe"
 )
 
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
 // Implemented in runtime/syscall_aix.go.
 func rawSyscall6(trap, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
 func syscall6(trap, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
diff --git a/src/syscall/syscall_darwin.go b/src/syscall/syscall_darwin.go
index 2e5387a..663bd98 100644
--- a/src/syscall/syscall_darwin.go
+++ b/src/syscall/syscall_darwin.go
@@ -17,6 +17,11 @@
 	"unsafe"
 )
 
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
 var dupTrampoline = abi.FuncPCABI0(libc_dup2_trampoline)
 
 type SockaddrDatalink struct {
diff --git a/src/syscall/syscall_dragonfly.go b/src/syscall/syscall_dragonfly.go
index d8edca9..3628895 100644
--- a/src/syscall/syscall_dragonfly.go
+++ b/src/syscall/syscall_dragonfly.go
@@ -17,6 +17,11 @@
 	"unsafe"
 )
 
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
 const _SYS_DUP3 = 0
 
 // See version list in https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/sys/sys/param.h
diff --git a/src/syscall/syscall_freebsd.go b/src/syscall/syscall_freebsd.go
index 8494b21..0637215 100644
--- a/src/syscall/syscall_freebsd.go
+++ b/src/syscall/syscall_freebsd.go
@@ -17,6 +17,11 @@
 	"unsafe"
 )
 
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
 const (
 	_SYS_FSTAT_FREEBSD12         = 551 // { int fstat(int fd, _Out_ struct stat *sb); }
 	_SYS_FSTATAT_FREEBSD12       = 552 // { int fstatat(int fd, _In_z_ char *path, _Out_ struct stat *buf, int flag); }
diff --git a/src/syscall/syscall_linux.go b/src/syscall/syscall_linux.go
index 74322ca..f9adecd 100644
--- a/src/syscall/syscall_linux.go
+++ b/src/syscall/syscall_linux.go
@@ -16,6 +16,78 @@
 	"unsafe"
 )
 
+// N.B. RawSyscall6 is provided via linkname by runtime/internal/syscall.
+//
+// Errno is uintptr and thus compatible with the runtime/internal/syscall
+// definition.
+
+func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
+// Pull in entersyscall/exitsyscall for Syscall/Syscall6.
+//
+// Note that this can't be a push linkname because the runtime already has a
+// nameless linkname to export to assembly here and in x/sys. Additionally,
+// entersyscall fetches the caller PC and SP and thus can't have a wrapper
+// in between.
+
+//go:linkname runtime_entersyscall runtime.entersyscall
+func runtime_entersyscall()
+//go:linkname runtime_exitsyscall runtime.exitsyscall
+func runtime_exitsyscall()
+
+// N.B. For the Syscall functions below:
+//
+// //go:uintptrkeepalive because the uintptr arguments may be pointers that
+// were converted to uintptr and must be kept alive in the caller (this is
+// implied for RawSyscall6 since it has no body).
+//
+// //go:nosplit because stack copying does not account for uintptrkeepalive, so
+// the stack must not grow. Stack copying cannot blindly assume that all
+// uintptr arguments are pointers, because some values may look like pointers,
+// but not really be pointers, and adjusting their value would break the call.
+//
+// //go:linkname to ensure ABI wrappers are generated for external callers
+// (notably x/sys/unix assembly).
+
+//go:uintptrkeepalive
+//go:nosplit
+//go:linkname RawSyscall
+func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno) {
+	return RawSyscall6(trap, a1, a2, a3, 0, 0, 0)
+}
+
+//go:uintptrkeepalive
+//go:nosplit
+//go:linkname Syscall
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno) {
+	runtime_entersyscall()
+	// N.B. Calling RawSyscall here is unsafe with atomic coverage
+	// instrumentation and race mode.
+	//
+	// Coverage instrumentation will add a sync/atomic call to RawSyscall.
+	// Race mode will add race instrumentation to sync/atomic. Race
+	// instrumentation requires a P, which we no longer have.
+	//
+	// RawSyscall6 is fine because it is implemented in assembly and thus
+	// has no coverage instrumentation.
+	//
+	// This is typically not a problem in the runtime because cmd/go avoids
+	// adding coverage instrumentation to the runtime in race mode.
+	r1, r2, err = RawSyscall6(trap, a1, a2, a3, 0, 0, 0)
+	runtime_exitsyscall()
+	return
+}
+
+//go:uintptrkeepalive
+//go:nosplit
+//go:linkname Syscall6
+func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno) {
+	runtime_entersyscall()
+	r1, r2, err = RawSyscall6(trap, a1, a2, a3, a4, a5, a6)
+	runtime_exitsyscall()
+	return
+}
+
 func rawSyscallNoError(trap, a1, a2, a3 uintptr) (r1, r2 uintptr)
 
 /*
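The pragmas above preserve the contract the deleted assembly gave callers: a pointer converted to uintptr inside the call expression stays live for the duration of the system call. A minimal, Linux-only sketch of a caller relying on that contract (the stdin read and buffer size are illustrative):

	package main

	import (
		"fmt"
		"syscall"
		"unsafe"
	)

	func main() {
		buf := make([]byte, 64)
		// The unsafe.Pointer-to-uintptr conversion must appear in the call
		// expression itself; the keepalive rules then pin buf until the
		// kernel has written through the pointer.
		n, _, errno := syscall.Syscall(syscall.SYS_READ, 0,
			uintptr(unsafe.Pointer(&buf[0])), uintptr(len(buf)))
		if errno != 0 {
			fmt.Println("read:", errno)
			return
		}
		fmt.Printf("read %d bytes\n", n)
	}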
diff --git a/src/syscall/syscall_netbsd.go b/src/syscall/syscall_netbsd.go
index 9ccb66c..07afa07 100644
--- a/src/syscall/syscall_netbsd.go
+++ b/src/syscall/syscall_netbsd.go
@@ -14,6 +14,12 @@
 
 import "unsafe"
 
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
 const _SYS_DUP3 = SYS_DUP3
 
 type SockaddrDatalink struct {
@@ -28,8 +34,6 @@
 	raw    RawSockaddrDatalink
 }
 
-func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err Errno)
-
 func sysctlNodes(mib []_C_int) (nodes []Sysctlnode, err error) {
 	var olen uintptr
 
diff --git a/src/syscall/syscall_openbsd.go b/src/syscall/syscall_openbsd.go
index 1d82351..19cf1f4 100644
--- a/src/syscall/syscall_openbsd.go
+++ b/src/syscall/syscall_openbsd.go
@@ -14,6 +14,12 @@
 
 import "unsafe"
 
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
 type SockaddrDatalink struct {
 	Len    uint8
 	Family uint8
@@ -26,8 +32,6 @@
 	raw    RawSockaddrDatalink
 }
 
-func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err Errno)
-
 func nametomib(name string) (mib []_C_int, err error) {
 	// Perform lookup via a binary search
 	left := 0
diff --git a/src/syscall/syscall_solaris.go b/src/syscall/syscall_solaris.go
index 37ce5c9..e591ae6 100644
--- a/src/syscall/syscall_solaris.go
+++ b/src/syscall/syscall_solaris.go
@@ -14,6 +14,11 @@
 
 import "unsafe"
 
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
+func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
 const _F_DUP2FD_CLOEXEC = F_DUP2FD_CLOEXEC
 
 // Implemented in asm_solaris_amd64.s.
diff --git a/src/syscall/syscall_unix.go b/src/syscall/syscall_unix.go
index e12f024..cf0e238 100644
--- a/src/syscall/syscall_unix.go
+++ b/src/syscall/syscall_unix.go
@@ -28,11 +28,6 @@
 	netbsd32Bit = runtime.GOOS == "netbsd" && sizeofPtr == 4
 )
 
-func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
-func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
-func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno)
-func RawSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
-
 // clen returns the index of the first NULL byte in n or len(n) if n contains no NULL byte.
 func clen(n []byte) int {
 	if i := bytealg.IndexByte(n, 0); i != -1 {
diff --git a/src/time/example_test.go b/src/time/example_test.go
index ec2259b..059c631 100644
--- a/src/time/example_test.go
+++ b/src/time/example_test.go
@@ -212,6 +212,11 @@
 		panic(err)
 	}
 
+	tz, err := time.LoadLocation("Asia/Shanghai")
+	if err != nil { // Always check errors even if they should not happen.
+		panic(err)
+	}
+
 	// time.Time's Stringer method is useful without any format.
 	fmt.Println("default format:", t)
 
@@ -221,6 +226,10 @@
 	// The time zone attached to the time value affects its output.
 	fmt.Println("Same, in UTC:", t.UTC().Format(time.UnixDate))
 
+	fmt.Println("in Shanghai with seconds:", t.In(tz).Format("2006-01-02T15:04:05 -070000"))
+
+	fmt.Println("in Shanghai with colon seconds:", t.In(tz).Format("2006-01-02T15:04:05 -07:00:00"))
+
 	// The rest of this function demonstrates the properties of the
 	// layout string used in the format.
 
@@ -286,6 +295,8 @@
 	// default format: 2015-02-25 11:06:39 -0800 PST
 	// Unix format: Wed Feb 25 11:06:39 PST 2015
 	// Same, in UTC: Wed Feb 25 19:06:39 UTC 2015
+	// in Shanghai with seconds: 2015-02-26T03:06:39 +080000
+	// in Shanghai with colon seconds: 2015-02-26T03:06:39 +08:00:00
 	//
 	// Formats:
 	//
diff --git a/src/time/format.go b/src/time/format.go
index 2f66df6..5f69618 100644
--- a/src/time/format.go
+++ b/src/time/format.go
@@ -64,17 +64,21 @@
 //
 // Numeric time zone offsets format as follows:
 //
-//	"-0700"  ±hhmm
-//	"-07:00" ±hh:mm
-//	"-07"    ±hh
+//	"-0700"     ±hhmm
+//	"-07:00"    ±hh:mm
+//	"-07"       ±hh
+//	"-070000"   ±hhmmss
+//	"-07:00:00" ±hh:mm:ss
 //
 // Replacing the sign in the format with a Z triggers
 // the ISO 8601 behavior of printing Z instead of an
 // offset for the UTC zone. Thus:
 //
-//	"Z0700"  Z or ±hhmm
-//	"Z07:00" Z or ±hh:mm
-//	"Z07"    Z or ±hh
+//	"Z0700"      Z or ±hhmm
+//	"Z07:00"     Z or ±hh:mm
+//	"Z07"        Z or ±hh
+//	"Z070000"    Z or ±hhmmss
+//	"Z07:00:00"  Z or ±hh:mm:ss
 //
 // Within the format string, the underscores in "_2" and "__2" represent spaces
 // that may be replaced by digits if the following number has multiple digits,
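A short sketch of the newly documented seconds-bearing offsets round-tripping through Format and Parse (the fixed zone is invented for illustration; sub-minute offsets occur in practice only in historical zone data):

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		loc := time.FixedZone("LMT", 8*3600+5*60+43) // +08:05:43
		t := time.Date(1900, 1, 1, 12, 0, 0, 0, loc)

		s := t.Format("2006-01-02T15:04:05 -07:00:00")
		fmt.Println(s) // 1900-01-01T12:00:00 +08:05:43

		back, err := time.Parse("2006-01-02T15:04:05 -07:00:00", s)
		if err != nil {
			panic(err)
		}
		fmt.Println(back.Equal(t)) // true
	}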
diff --git a/src/time/time_test.go b/src/time/time_test.go
index 1701401..695d48b 100644
--- a/src/time/time_test.go
+++ b/src/time/time_test.go
@@ -281,6 +281,8 @@
 	b1e9.SetInt64(1e9)
 
 	testOne := func(ti, tns, di int64) bool {
+		t.Helper()
+
 		t0 := Unix(ti, int64(tns)).UTC()
 		d := Duration(di)
 		if d < 0 {
@@ -367,6 +369,13 @@
 		for i := 0; i < int(b); i++ {
 			d *= 5
 		}
+
+		// Make room for unix ↔ internal conversion.
+		// We don't care about behavior too close to ± 2^63 Unix seconds.
+		// That range is full of wraparounds, but they will never occur in a reasonable program.
+		// (Or maybe not? See go.dev/issue/20678. In any event, they're not handled today.)
+		ti >>= 1
+
 		return testOne(ti, int64(tns), int64(d))
 	}
 	quick.Check(f1, cfg)
@@ -377,6 +386,7 @@
 		if d < 0 {
 			d = -d
 		}
+		ti >>= 1 // see comment in f1
 		return testOne(ti, int64(tns), int64(d))
 	}
 	quick.Check(f2, cfg)
@@ -399,6 +409,7 @@
 
 	// full generality
 	f4 := func(ti int64, tns int32, di int64) bool {
+		ti >>= 1 // see comment in f1
 		return testOne(ti, int64(tns), di)
 	}
 	quick.Check(f4, cfg)
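The right shift keeps the generated Unix seconds far enough from ±2^63 that adding the Unix-to-internal epoch offset (about 62135596800 seconds, year 1 to 1970) cannot wrap an int64. A small sketch of the wraparound being avoided (the constant is quoted here for illustration):

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		const unixToInternal = int64(62135596800) // seconds from Jan 1, year 1 to Jan 1, 1970
		ti := int64(math.MaxInt64 - 10)
		fmt.Println(ti + unixToInternal)        // wraps to a large negative value
		fmt.Println((ti >> 1) + unixToInternal) // comfortably in range
	}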
diff --git a/test/fixedbugs/issue23587.go b/test/fixedbugs/issue23587.go
index 2308992..9040767 100644
--- a/test/fixedbugs/issue23587.go
+++ b/test/fixedbugs/issue23587.go
@@ -7,7 +7,7 @@
 package p
 
 func _(x int) {
-	_ = ~x    // ERROR "unexpected ~"
+	_ = ~x    // unary ~ permitted but the type-checker will complain
 }
 
 func _(x int) {
diff --git a/test/fixedbugs/issue52438.go b/test/fixedbugs/issue52438.go
new file mode 100644
index 0000000..375e727
--- /dev/null
+++ b/test/fixedbugs/issue52438.go
@@ -0,0 +1,39 @@
+// run
+
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+const c1 = iota
+const c2 = iota
+
+const c3 = 0 + iota<<8
+const c4 = 1 + iota<<8
+
+func main() {
+	if c1 != 0 {
+		panic(c1)
+	}
+	if c2 != 0 {
+		panic(c2)
+	}
+
+	if c3 != 0 {
+		panic(c3)
+	}
+	if c4 != 1 {
+		panic(c4)
+	}
+
+	const c5 = iota
+	const c6 = iota
+
+	if c5 != 0 {
+		panic(c5)
+	}
+	if c6 != 0 {
+		panic(c6)
+	}
+}
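The behavior under test: iota restarts at zero in every const declaration, grouped or not; only successive specs within a single grouped declaration increment it. Sketch:

	package main

	import "fmt"

	func main() {
		const (
			a = iota // 0
			b        // 1: iota advances per spec within a group
		)
		const c = iota // 0: each standalone declaration restarts iota
		const d = iota // 0
		fmt.Println(a, b, c, d) // 0 1 0 0
	}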
diff --git a/test/live_syscall.go b/test/live_syscall.go
deleted file mode 100644
index b920ff6..0000000
--- a/test/live_syscall.go
+++ /dev/null
@@ -1,40 +0,0 @@
-// errorcheck -0 -m -live
-
-// +build !windows,!js
-
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Test escape analysis and liveness inferred for syscall.Syscall-like functions.
-
-package p
-
-import (
-	"syscall"
-	"unsafe"
-)
-
-func f(uintptr) // ERROR "assuming arg#1 is unsafe uintptr"
-
-func g() { // ERROR "can inline g"
-	var t int
-	f(uintptr(unsafe.Pointer(&t))) // ERROR "live at call to f: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
-}
-
-func h() { // ERROR "can inline h"
-	var v int
-	syscall.Syscall(0, 1, uintptr(unsafe.Pointer(&v)), 2) // ERROR "live at call to Syscall: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
-}
-
-func i() { // ERROR "can inline i"
-	var t int
-	p := unsafe.Pointer(&t)
-	f(uintptr(p))           // ERROR "live at call to f: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
-}
-
-func j() { // ERROR "can inline j"
-	var v int
-	p := unsafe.Pointer(&v)
-	syscall.Syscall(0, 1, uintptr(p), 2) // ERROR "live at call to Syscall: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
-}
diff --git a/test/live_uintptrkeepalive.go b/test/live_uintptrkeepalive.go
new file mode 100644
index 0000000..10c4c75
--- /dev/null
+++ b/test/live_uintptrkeepalive.go
@@ -0,0 +1,63 @@
+// errorcheck -0 -m -live -std
+
+// +build !windows,!js
+
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test escape analysis and liveness inferred for uintptrkeepalive functions.
+//
+// This behavior is enabled automatically for function declarations with no
+// bodies (assembly, linkname), as well as explicitly on complete functions
+// with //go:uintptrkeepalive.
+//
+// This is most important for syscall.Syscall (and similar functions), so we
+// test it explicitly.
+
+package p
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+func implicit(uintptr) // ERROR "assuming arg#1 is unsafe uintptr"
+
+//go:uintptrkeepalive
+//go:nosplit
+func explicit(uintptr) {
+}
+
+func autotmpImplicit() { // ERROR "can inline autotmpImplicit"
+	var t int
+	implicit(uintptr(unsafe.Pointer(&t))) // ERROR "live at call to implicit: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
+}
+
+func autotmpExplicit() { // ERROR "can inline autotmpExplicit"
+	var t int
+	explicit(uintptr(unsafe.Pointer(&t))) // ERROR "live at call to explicit: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
+}
+
+func autotmpSyscall() { // ERROR "can inline autotmpSyscall"
+	var v int
+	syscall.Syscall(0, 1, uintptr(unsafe.Pointer(&v)), 2) // ERROR "live at call to Syscall: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
+}
+
+func localImplicit() { // ERROR "can inline localImplicit"
+	var t int
+	p := unsafe.Pointer(&t)
+	implicit(uintptr(p)) // ERROR "live at call to implicit: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
+}
+
+func localExplicit() { // ERROR "can inline localExplicit"
+	var t int
+	p := unsafe.Pointer(&t)
+	explicit(uintptr(p)) // ERROR "live at call to explicit: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
+}
+
+func localSyscall() { // ERROR "can inline localSyscall"
+	var v int
+	p := unsafe.Pointer(&v)
+	syscall.Syscall(0, 1, uintptr(p), 2) // ERROR "live at call to Syscall: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
+}
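Note the -std flag on the errorcheck line: //go:uintptrkeepalive is accepted only in the standard library. Code elsewhere gets the same effect from the implicit rule for bodyless (assembly/linkname) functions or, more portably, with runtime.KeepAlive; a hedged sketch (takesUintptr is a hypothetical stand-in for such a function):

	package main

	import (
		"fmt"
		"runtime"
		"unsafe"
	)

	func takesUintptr(p uintptr) { fmt.Printf("%#x\n", p) }

	func main() {
		v := new(int)
		takesUintptr(uintptr(unsafe.Pointer(v)))
		// Without the keepalive rules, v could be reclaimed while the call
		// still holds only an integer; KeepAlive pins it until here.
		runtime.KeepAlive(v)
	}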
diff --git a/test/typeparam/cons.go b/test/typeparam/cons.go
index b7bace4..733e579 100644
--- a/test/typeparam/cons.go
+++ b/test/typeparam/cons.go
@@ -2,7 +2,7 @@
 
 // Copyright 2021 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
-// lice
+// license that can be found in the LICENSE file.
 
 package main
 
diff --git a/test/uintptrkeepalive.go b/test/uintptrkeepalive.go
new file mode 100644
index 0000000..97834dc
--- /dev/null
+++ b/test/uintptrkeepalive.go
@@ -0,0 +1,11 @@
+// errorcheck -std
+
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package p
+
+//go:uintptrkeepalive
+func missingNosplit(uintptr) { // ERROR "go:uintptrkeepalive requires go:nosplit"
+}