libgo: ensure memmove, memset 8 byte atomicity on ppc64x

Go requires that pointer moves be done 8 bytes at a time, but
gccgo uses libc's memmove and memset, which do not guarantee
that: in some cases an 8 byte move may be done as two 4 byte
moves.
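
As an illustration of why a 4+4 split is unsafe (a hypothetical
sketch, not part of this change; torn_copy is an invented name):
if an 8 byte pointer slot is written with two 4 byte stores, a
concurrent reader such as the garbage collector can load the
slot between the two stores and observe a torn pointer.

    #include <string.h>

    /* Hypothetical sketch: an 8 byte pointer slot copied as two
       4 byte stores, as a libc memmove is permitted to do.  */
    void
    torn_copy (void **dst, void **src)
    {
      memcpy (dst, src, 4);  /* first half of the slot written */
      /* A concurrent 8 byte load of *dst at this point sees a
         mix of the old and new pointer values.  */
      memcpy ((char *) dst + 4, (char *) src + 4, 4);
    }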

To enforce 8 byte moves, this adds C implementations of memmove
and memset in libgo/runtime, used on ppc64le and ppc64. Assembly
implementations were considered but rejected to avoid maintaining
separate implementations for different target ISAs.
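
The core of the fix, sketched in simplified form (copy_dwords is
an invented name; this assumes dst and src already share 8 byte
alignment, which the full code below arranges by moving unaligned
head and tail bytes separately): each pointer-sized slot is
copied with a single 8 byte load and store.

    #include <stdint.h>
    #include <stddef.h>

    /* Simplified sketch of the aligned doubleword loop in
       go-memmove.c below; dwords is the count of 8 byte slots.  */
    static void
    copy_dwords (uint64_t *dst, const uint64_t *src, size_t dwords)
    {
      size_t i;
      for (i = 0; i < dwords; i++)
        dst[i] = src[i];  /* one atomic-width store per slot */
    }

Note that the memclr variant below writes through a volatile
pointer so the compiler cannot recognize the loop and convert it
back into a call to memset.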

Fixes golang/go#41428

Change-Id: I4f3a0969a501054efdcbca8b771a07c006cad9b0
Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/294931
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Trust: Lynn Boger <laboger@linux.vnet.ibm.com>
diff --git a/libgo/Makefile.am b/libgo/Makefile.am
index eea7ff1..dec9875 100644
--- a/libgo/Makefile.am
+++ b/libgo/Makefile.am
@@ -454,6 +454,7 @@
 	runtime/go-fieldtrack.c \
 	runtime/go-matherr.c \
 	runtime/go-memclr.c \
+	runtime/go-memmove.c \
 	runtime/go-memequal.c \
 	runtime/go-nanotime.c \
 	runtime/go-now.c \
diff --git a/libgo/Makefile.in b/libgo/Makefile.in
index 8938eba..da72149 100644
--- a/libgo/Makefile.in
+++ b/libgo/Makefile.in
@@ -244,8 +244,9 @@
 	runtime/go-caller.lo runtime/go-callers.lo runtime/go-cgo.lo \
 	runtime/go-construct-map.lo runtime/go-ffi.lo \
 	runtime/go-fieldtrack.lo runtime/go-matherr.lo \
-	runtime/go-memclr.lo runtime/go-memequal.lo \
-	runtime/go-nanotime.lo runtime/go-now.lo runtime/go-nosys.lo \
+	runtime/go-memclr.lo runtime/go-memmove.lo \
+	runtime/go-memequal.lo runtime/go-nanotime.lo \
+	runtime/go-now.lo runtime/go-nosys.lo \
 	runtime/go-reflect-call.lo runtime/go-setenv.lo \
 	runtime/go-signal.lo runtime/go-unsafe-pointer.lo \
 	runtime/go-unsetenv.lo runtime/go-unwind.lo \
@@ -906,6 +907,7 @@
 	runtime/go-fieldtrack.c \
 	runtime/go-matherr.c \
 	runtime/go-memclr.c \
+	runtime/go-memmove.c \
 	runtime/go-memequal.c \
 	runtime/go-nanotime.c \
 	runtime/go-now.c \
@@ -1367,6 +1369,8 @@
 	runtime/$(DEPDIR)/$(am__dirstamp)
 runtime/go-memclr.lo: runtime/$(am__dirstamp) \
 	runtime/$(DEPDIR)/$(am__dirstamp)
+runtime/go-memmove.lo: runtime/$(am__dirstamp) \
+	runtime/$(DEPDIR)/$(am__dirstamp)
 runtime/go-memequal.lo: runtime/$(am__dirstamp) \
 	runtime/$(DEPDIR)/$(am__dirstamp)
 runtime/go-nanotime.lo: runtime/$(am__dirstamp) \
@@ -1435,6 +1439,7 @@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-matherr.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-memclr.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-memequal.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-memmove.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-nanotime.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-nosys.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-now.Plo@am__quote@
diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go
index 6d20c38..dea7234 100644
--- a/libgo/go/runtime/stubs.go
+++ b/libgo/go/runtime/stubs.go
@@ -106,9 +106,7 @@
 	memclrNoHeapPointers(ptr, n)
 }
 
-// memmove copies n bytes from "from" to "to".
 //go:noescape
-//extern __builtin_memmove
 func memmove(to, from unsafe.Pointer, n uintptr)
 
 //go:linkname reflect_memmove reflect.memmove
diff --git a/libgo/runtime/go-memclr.c b/libgo/runtime/go-memclr.c
index b5d4975..53b8117 100644
--- a/libgo/runtime/go-memclr.c
+++ b/libgo/runtime/go-memclr.c
@@ -13,5 +13,48 @@
 void
 memclrNoHeapPointers (void *p1, uintptr len)
 {
-  __builtin_memset (p1, 0, len);
+
+#if !defined(__PPC64__)
+  __builtin_memset(p1, 0, len);
+#else
+  int64 rem,drem,i;
+  uint64 offset;
+  volatile uint64 *vp;
+
+  if (len == 0) {
+    return;
+  }
+  rem = len;
+
+  offset = (uint64)p1 % 8;
+  // This memset is OK since it can't contain
+  // an 8 byte aligned pointer.
+  if ((rem < 8) || (offset > 0 && offset+rem <= 16)) {
+    __builtin_memset(p1, 0, rem);
+    return;
+  }
+  // Move initial bytes to get to 8 byte boundary
+  if (offset > 0) {
+    __builtin_memset(p1, 0, 8-offset);
+    p1 = (void*)((char*)p1+8-offset);
+    rem -= 8-offset;
+  }
+
+  // If at least 8 bytes left, clear
+  drem = rem>>3;
+
+  vp = (volatile uint64*)(p1);
+  // Without the use of volatile here, the compiler
+  // might convert the loop into a memset.
+  for (i=0; i<drem; i++) {
+    *vp = 0;
+    vp++;
+    rem -= 8;
+  }
+  p1 = (void*)((char*)p1 + 8*drem);
+  // Clear any remaining
+  if (rem > 0) {
+    __builtin_memset (p1, 0, rem);
+  }
+#endif
 }
diff --git a/libgo/runtime/go-memmove.c b/libgo/runtime/go-memmove.c
new file mode 100644
index 0000000..1ca3f48
--- /dev/null
+++ b/libgo/runtime/go-memmove.c
@@ -0,0 +1,89 @@
+/* go-memmove.c -- memmove
+
+   Copyright 2021 The Go Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style
+   license that can be found in the LICENSE file.  */
+
+#include "runtime.h"
+
+void gomemmove(void *, void *, uintptr)
+  __asm__ (GOSYM_PREFIX "runtime.memmove")
+  __attribute__ ((no_split_stack));
+
+// This implementation is necessary because
+// __builtin_memmove might call libc's memmove,
+// which does not guarantee atomicity of 8 byte
+// moves.
+
+void
+gomemmove (void *dst, void *src, uintptr len)
+{
+#if !defined(__PPC64__)
+  __builtin_memmove(dst, src, len);
+#else
+  uint64 offset, tail;
+  int64 rem;
+  uint64 dwords;
+  uint64 i;
+  char *bdst,*bsrc;
+
+  rem = len;
+
+  if (len == 0) {
+	return;
+  }
+
+  // If src and dst don't have the same 8 byte alignment then
+  // there is no issue with copying pointer atomicity. Use the
+  // builtin.
+  if (((uint64)dst % 8) != ((uint64)src % 8) || len < 8) {
+	__builtin_memmove(dst, src, len);
+	return;
+  }
+
+  // Length >= 8 && same ptr alignment
+  offset = (uint64)dst % 8;
+
+  // If not 8 byte aligned, move the initial bytes.
+  if (offset > 0) {
+	__builtin_memmove(dst, src, 8-offset);
+	dst += (8-offset);
+	src += (8-offset);
+	rem -= (8-offset);
+  }
+
+  // Move the tail bytes to make the backward move
+  // easier.
+  tail = rem % 8;
+  if (tail > 0) {
+	__builtin_memmove(dst+rem-tail, src+rem-tail, tail);
+	rem -= tail;
+  }
+
+  if (rem == 0) {
+	return;
+  }
+
+  // dst and src are now 8 byte aligned and rem is a multiple of 8.
+  dwords = rem>>3;
+
+  // Determine if a backwards move is needed
+  // Forward or backward, move all doublewords
+
+  if ((uint64)(dst - src) < (uint64)rem) {
+	bdst = dst+rem-8;
+	bsrc = src+rem-8;
+	for (i = 0; i<dwords; i++) {
+		*(uint64*)bdst = *(uint64*)bsrc;
+		bdst -= 8;
+		bsrc -= 8;
+	}
+  } else {
+	for (i = 0; i<dwords; i++) {
+		*(uint64*)dst = *(uint64*)src;
+		dst += 8;
+		src += 8;
+	}
+  }
+#endif
+}
diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h
index 3a65d44..b3dc4fd 100644
--- a/libgo/runtime/runtime.h
+++ b/libgo/runtime/runtime.h
@@ -221,7 +221,8 @@
 void	runtime_printf(const char*, ...);
 int32	runtime_snprintf(byte*, int32, const char*, ...);
 #define runtime_mcmp(a, b, s) __builtin_memcmp((a), (b), (s))
-#define runtime_memmove(a, b, s) __builtin_memmove((a), (b), (s))
+void runtime_memmove(void*, void*, uint64)
+  __asm__ (GOSYM_PREFIX "runtime.memmove");
 String	runtime_gostringnocopy(const byte*)
   __asm__ (GOSYM_PREFIX "runtime.gostringnocopy");
 void	runtime_ginit(void)
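
As a quick sanity check of the overlap handling (a standalone
hypothetical sketch, not part of the patch; dword_move mirrors
the patch's two doubleword loops and its backward-move test
(uint64)(dst - src) < (uint64)rem), one can compare an
overlapping copy against libc memmove:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Copy backward when dst overlaps the top of src, else
       forward; the same decision the patched gomemmove makes.  */
    static void
    dword_move (uint64_t *dst, uint64_t *src, size_t dwords)
    {
      size_t i;
      if ((uint64_t) ((char *) dst - (char *) src) < dwords * 8)
        for (i = dwords; i-- > 0;)   /* overlap: copy backward */
          dst[i] = src[i];
      else
        for (i = 0; i < dwords; i++) /* safe: copy forward */
          dst[i] = src[i];
    }

    int
    main (void)
    {
      uint64_t a[16], b[16];
      size_t i;
      for (i = 0; i < 16; i++)
        a[i] = b[i] = i;
      dword_move (a + 2, a, 10);     /* overlapping move */
      memmove (b + 2, b, 10 * sizeof (uint64_t));
      printf ("%s\n", memcmp (a, b, sizeof a) == 0 ? "ok" : "mismatch");
      return 0;
    }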