runtime: implement cheaper context switch on Linux/AMD64

Currently, goroutine switches are implemented with the libc
getcontext/setcontext functions, which save/restore the machine
register state and also the signal context. This does more than
we need, and performs an expensive syscall.

This CL implements a simplified version of getcontext/setcontext,
in assembly, that only saves/restores the necessary part, i.e.
the callee-save registers plus the PC and SP. A simplified version
of makecontext, also written in assembly, is added as well.
Currently this is only implemented on Linux/AMD64.

Change-Id: I326347c6530663747c2c46f27e9d1bdc7f073290
Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/178298
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/libgo/Makefile.am b/libgo/Makefile.am
index ebbdee8..8a3844c 100644
--- a/libgo/Makefile.am
+++ b/libgo/Makefile.am
@@ -481,6 +481,7 @@
 	runtime/runtime_c.c \
 	runtime/stack.c \
 	runtime/yield.c \
+	runtime/go-context.S \
 	$(rtems_task_variable_add_file) \
 	$(runtime_getncpu_file)
 
diff --git a/libgo/Makefile.in b/libgo/Makefile.in
index 737b01e..20e2a49 100644
--- a/libgo/Makefile.in
+++ b/libgo/Makefile.in
@@ -253,7 +253,8 @@
 	runtime/go-unwind.lo runtime/go-varargs.lo \
 	runtime/env_posix.lo runtime/panic.lo runtime/print.lo \
 	runtime/proc.lo runtime/runtime_c.lo runtime/stack.lo \
-	runtime/yield.lo $(am__objects_1) $(am__objects_2)
+	runtime/yield.lo runtime/go-context.lo $(am__objects_1) \
+	$(am__objects_2)
 am_libgo_llgo_la_OBJECTS = $(am__objects_3)
 libgo_llgo_la_OBJECTS = $(am_libgo_llgo_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
@@ -287,6 +288,16 @@
 depcomp = $(SHELL) $(top_srcdir)/../depcomp
 am__depfiles_maybe = depfiles
 am__mv = mv -f
+CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS)
+LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CCASFLAGS) $(CCASFLAGS)
+AM_V_CPPAS = $(am__v_CPPAS_@AM_V@)
+am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@)
+am__v_CPPAS_0 = @echo "  CPPAS   " $@;
+am__v_CPPAS_1 = 
 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
 	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
 LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
@@ -380,6 +391,9 @@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CC_FOR_BUILD = @CC_FOR_BUILD@
 CFLAGS = @CFLAGS@
@@ -512,6 +526,7 @@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -901,6 +916,7 @@
 	runtime/runtime_c.c \
 	runtime/stack.c \
 	runtime/yield.c \
+	runtime/go-context.S \
 	$(rtems_task_variable_add_file) \
 	$(runtime_getncpu_file)
 
@@ -1158,7 +1174,7 @@
 	$(MAKE) $(AM_MAKEFLAGS) all-recursive
 
 .SUFFIXES:
-.SUFFIXES: .c .go .gox .o .obj .lo .a
+.SUFFIXES: .c .go .gox .o .obj .lo .a .S
 am--refresh: Makefile
 	@:
 $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/../multilib.am $(am__configure_deps)
@@ -1375,6 +1391,8 @@
 	runtime/$(DEPDIR)/$(am__dirstamp)
 runtime/yield.lo: runtime/$(am__dirstamp) \
 	runtime/$(DEPDIR)/$(am__dirstamp)
+runtime/go-context.lo: runtime/$(am__dirstamp) \
+	runtime/$(DEPDIR)/$(am__dirstamp)
 runtime/rtems-task-variable-add.lo: runtime/$(am__dirstamp) \
 	runtime/$(DEPDIR)/$(am__dirstamp)
 runtime/getncpu-none.lo: runtime/$(am__dirstamp) \
@@ -1421,6 +1439,7 @@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-cdiv.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-cgo.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-construct-map.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-context.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-ffi.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-fieldtrack.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/go-matherr.Plo@am__quote@
@@ -1451,6 +1470,30 @@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/stack.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@runtime/$(DEPDIR)/yield.Plo@am__quote@
 
+.S.o:
+@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
+@am__fastdepCCAS_TRUE@	$(CPPASCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCCAS_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ $<
+
+.S.obj:
+@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\
+@am__fastdepCCAS_TRUE@	$(CPPASCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\
+@am__fastdepCCAS_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Po
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.S.lo:
+@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\
+@am__fastdepCCAS_TRUE@	$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
+@am__fastdepCCAS_TRUE@	$(am__mv) $$depbase.Tpo $$depbase.Plo
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $<
+
 .c.o:
 @am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
 @am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\
diff --git a/libgo/aclocal.m4 b/libgo/aclocal.m4
index b55bb82..951aed3 100644
--- a/libgo/aclocal.m4
+++ b/libgo/aclocal.m4
@@ -56,6 +56,26 @@
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 
+# Figure out how to run the assembler.                      -*- Autoconf -*-
+
+# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_PROG_AS
+# ----------
+AC_DEFUN([AM_PROG_AS],
+[# By default we simply use the C compiler to build assembly code.
+AC_REQUIRE([AC_PROG_CC])
+test "${CCAS+set}" = set || CCAS=$CC
+test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS
+AC_ARG_VAR([CCAS],      [assembler compiler command (defaults to CC)])
+AC_ARG_VAR([CCASFLAGS], [assembler compiler flags (defaults to CFLAGS)])
+_AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
+])
+
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
 # Copyright (C) 2001-2017 Free Software Foundation, Inc.
diff --git a/libgo/configure b/libgo/configure
index 1e33a57..837f6fe 100755
--- a/libgo/configure
+++ b/libgo/configure
@@ -729,6 +729,11 @@
 MAINT
 MAINTAINER_MODE_FALSE
 MAINTAINER_MODE_TRUE
+am__fastdepCCAS_FALSE
+am__fastdepCCAS_TRUE
+CCASDEPMODE
+CCASFLAGS
+CCAS
 GOFLAGS
 GOC
 am__fastdepCC_FALSE
@@ -808,6 +813,7 @@
 docdir
 oldincludedir
 includedir
+runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -889,6 +895,7 @@
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -1141,6 +1148,15 @@
   | -silent | --silent | --silen | --sile | --sil)
     silent=yes ;;
 
+  -runstatedir | --runstatedir | --runstatedi | --runstated \
+  | --runstate | --runstat | --runsta | --runst | --runs \
+  | --run | --ru | --r)
+    ac_prev=runstatedir ;;
+  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+  | --run=* | --ru=* | --r=*)
+    runstatedir=$ac_optarg ;;
+
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
     ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1278,7 +1294,7 @@
 for ac_var in	exec_prefix prefix bindir sbindir libexecdir datarootdir \
 		datadir sysconfdir sharedstatedir localstatedir includedir \
 		oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-		libdir localedir mandir
+		libdir localedir mandir runstatedir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1431,6 +1447,7 @@
   --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --runstatedir=DIR       modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIR            object code libraries [EPREFIX/lib]
   --includedir=DIR        C header files [PREFIX/include]
   --oldincludedir=DIR     C header files for non-gcc [/usr/include]
@@ -1510,6 +1527,8 @@
               you have headers in a nonstandard directory <include dir>
   GOC         Go compiler command
   GOFLAGS     Go compiler flags
+  CCAS        assembler compiler command (defaults to CC)
+  CCASFLAGS   assembler compiler flags (defaults to CFLAGS)
   CPP         C preprocessor
 
 Use these variables to override the choices made by `configure' or to help
@@ -4556,6 +4575,139 @@
 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
+# By default we simply use the C compiler to build assembly code.
+
+test "${CCAS+set}" = set || CCAS=$CC
+test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS
+
+
+
+depcc="$CCAS"   am_compiler_list=
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
+$as_echo_n "checking dependency style of $depcc... " >&6; }
+if ${am_cv_CCAS_dependencies_compiler_type+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named 'D' -- because '-MD' means "put the output
+  # in D".
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+
+  am_cv_CCAS_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  fi
+  am__universal=false
+
+
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
+      # Solaris 10 /bin/sh.
+      echo '/* dummy */' > sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+
+    # We check with '-c' and '-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle '-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs.
+    am__obj=sub/conftest.${OBJEXT-o}
+    am__minus_obj="-o $am__obj"
+    case $depmode in
+    gcc)
+      # This depmode causes a compiler race in universal mode.
+      test "$am__universal" = false || continue
+      ;;
+    nosideeffect)
+      # After this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested.
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
+      # This compiler won't grok '-c -o', but also, the minuso test has
+      # not run yet.  These depmodes are late enough in the game, and
+      # so weak that their functioning should not be impacted.
+      am__obj=conftest.${OBJEXT-o}
+      am__minus_obj=
+      ;;
+    none) break ;;
+    esac
+    if depmode=$depmode \
+       source=sub/conftest.c object=$am__obj \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_CCAS_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_CCAS_dependencies_compiler_type=none
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5
+$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; }
+CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type
+
+ if
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_CCAS_dependencies_compiler_type" = gcc3; then
+  am__fastdepCCAS_TRUE=
+  am__fastdepCCAS_FALSE='#'
+else
+  am__fastdepCCAS_TRUE='#'
+  am__fastdepCCAS_FALSE=
+fi
+
+
 
 
 
@@ -11344,7 +11496,7 @@
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11347 "configure"
+#line 11499 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -11450,7 +11602,7 @@
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11453 "configure"
+#line 11605 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -15838,6 +15990,10 @@
   as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then
+  as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then
   as_fn_error $? "conditional \"MAINTAINER_MODE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
diff --git a/libgo/configure.ac b/libgo/configure.ac
index 0bd328c..0ae8162 100644
--- a/libgo/configure.ac
+++ b/libgo/configure.ac
@@ -26,6 +26,7 @@
 m4_define([_AC_ARG_VAR_PRECIOUS],[])
 AC_PROG_CC
 AC_PROG_GO
+AM_PROG_AS
 m4_rename_force([glibgo_PRECIOUS],[_AC_ARG_VAR_PRECIOUS])
 
 AC_SUBST(CFLAGS)
diff --git a/libgo/runtime/go-context.S b/libgo/runtime/go-context.S
new file mode 100644
index 0000000..0cd2242
--- /dev/null
+++ b/libgo/runtime/go-context.S
@@ -0,0 +1,101 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This provides a simplified version of getcontext and
+// setcontext. They are like the corresponding functions
+// in libc, but we only save/restore the callee-save
+// registers and PC, SP. Unlike the libc functions, we
+// don't save/restore the signal masks and floating point
+// environment.
+
+#if defined(__x86_64__) && defined(__linux__) && !defined(__CET__)
+
+// Byte offsets of the saved registers inside __go_context_t
+// (an array of eight 64-bit slots, see runtime.h).
+#define RBP_OFF	(0*8)
+#define RBX_OFF	(1*8)
+#define R12_OFF	(2*8)
+#define R13_OFF	(3*8)
+#define R14_OFF	(4*8)
+#define R15_OFF	(5*8)
+#define SP_OFF	(6*8)
+#define PC_OFF	(7*8)
+
+// int __go_getcontext(__go_context_t *ctx)
+//
+// Stores the callee-save registers, the SP of the caller and the
+// return PC into *ctx (passed in %rdi), then returns 0.  A later
+// __go_setcontext on the same ctx resumes execution right after
+// this call, again with a return value of 0.
+.globl __go_getcontext
+.type __go_getcontext, @function
+.text
+__go_getcontext:
+	movq	%rbx, RBX_OFF(%rdi)
+	movq	%rbp, RBP_OFF(%rdi)
+	movq	%r12, R12_OFF(%rdi)
+	movq	%r13, R13_OFF(%rdi)
+	movq	%r14, R14_OFF(%rdi)
+	movq	%r15, R15_OFF(%rdi)
+
+	movq	(%rsp), %rax	// return PC
+	movq	%rax, PC_OFF(%rdi)
+	leaq	8(%rsp), %rax	// the SP before pushing return PC
+	movq	%rax, SP_OFF(%rdi)
+
+	xorl	%eax, %eax	// %rax was used as scratch; return 0
+	ret
+.size __go_getcontext, . - __go_getcontext
+
+// int __go_setcontext(__go_context_t *ctx)
+//
+// Reloads the callee-save registers and SP from *ctx (in %rdi) and
+// jumps to the saved PC.  Does not return to its caller.
+.globl __go_setcontext
+.type __go_setcontext, @function
+.text
+__go_setcontext:
+	movq	RBX_OFF(%rdi), %rbx
+	movq	RBP_OFF(%rdi), %rbp
+	movq	R12_OFF(%rdi), %r12
+	movq	R13_OFF(%rdi), %r13
+	movq	R14_OFF(%rdi), %r14
+	movq	R15_OFF(%rdi), %r15
+	movq	SP_OFF(%rdi), %rsp
+	movq	PC_OFF(%rdi), %rdx
+
+	// Make the resumed __go_getcontext call appear to return 0.
+	xorl	%eax, %eax
+	jmp	*%rdx
+.size __go_setcontext, . - __go_setcontext
+
+// void __go_makecontext(__go_context_t *ctx, void (*fn)(),
+//                       void *sp, size_t size)
+//
+// Points ctx (%rdi) at entry point fn (%rsi) running on the stack
+// [sp, sp+size) (%rdx, %rcx).  Only the SP and PC slots are written.
+.globl __go_makecontext
+.type __go_makecontext, @function
+.text
+__go_makecontext:
+	addq	%rcx, %rdx	// top of the stack: sp + size
+
+	// Align the SP, and push a dummy return address.
+	andq	$~0xfULL, %rdx
+	subq	$8, %rdx
+	movq	$0, (%rdx)
+
+	movq	%rdx, SP_OFF(%rdi)
+	movq	%rsi, PC_OFF(%rdi)
+
+	ret
+.size __go_makecontext, . - __go_makecontext
+
+#endif
+
+// Mark the object as not needing an executable stack; without this
+// note GNU ld may treat an assembly object as requiring one.
+#if defined(__linux__) && defined(__ELF__)
+	.section	.note.GNU-stack,"",%progbits
+#endif
diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c
index 1569b5b..5ef421f 100644
--- a/libgo/runtime/proc.c
+++ b/libgo/runtime/proc.c
@@ -75,7 +75,7 @@
 }
 
 static inline void
-fixcontext(ucontext_t *c __attribute__ ((unused)))
+fixcontext(__go_context_t *c __attribute__ ((unused)))
 {
 }
 
@@ -182,18 +182,18 @@
 // Go, and Go has no simple way to align a field to such a boundary.
 // So we make the field larger in runtime2.go and pick an appropriate
 // offset within the field here.
-static ucontext_t*
+static __go_context_t*
 ucontext_arg(uintptr_t* go_ucontext)
 {
 	uintptr_t p = (uintptr_t)go_ucontext;
-	size_t align = __alignof__(ucontext_t);
+	size_t align = __alignof__(__go_context_t);
 	if(align > 16) {
 		// We only ensured space for up to a 16 byte alignment
 		// in libgo/go/runtime/runtime2.go.
-		runtime_throw("required alignment of ucontext_t too large");
+		runtime_throw("required alignment of __go_context_t too large");
 	}
 	p = (p + align - 1) &~ (uintptr_t)(align - 1);
-	return (ucontext_t*)p;
+	return (__go_context_t*)p;
 }
 
 // We can not always refer to the TLS variables directly.  The
@@ -289,7 +289,7 @@
 	g = newg;
 	newg->fromgogo = true;
 	fixcontext(ucontext_arg(&newg->context[0]));
-	setcontext(ucontext_arg(&newg->context[0]));
+	__go_setcontext(ucontext_arg(&newg->context[0]));
 	runtime_throw("gogo setcontext returned");
 }
 
@@ -328,7 +328,7 @@
 		gp->gcnextsp2 = (uintptr)(secondary_stack_pointer());
 #endif
 		gp->fromgogo = false;
-		getcontext(ucontext_arg(&gp->context[0]));
+		__go_getcontext(ucontext_arg(&gp->context[0]));
 
 		// When we return from getcontext, we may be running
 		// in a new thread.  That means that g may have
@@ -358,7 +358,7 @@
 		g = mp->g0;
 
 		fixcontext(ucontext_arg(&mp->g0->context[0]));
-		setcontext(ucontext_arg(&mp->g0->context[0]));
+		__go_setcontext(ucontext_arg(&mp->g0->context[0]));
 		runtime_throw("runtime: mcall function returned");
 	}
 }
@@ -450,7 +450,7 @@
 #ifdef USING_SPLIT_STACK
 	__splitstack_getcontext((void*)(&me->stackcontext[0]));
 #endif
-	getcontext(ucontext_arg(&me->context[0]));
+	__go_getcontext(ucontext_arg(&me->context[0]));
 
 	if (gp->traceback != 0) {
 		runtime_gogo(gp);
@@ -493,7 +493,7 @@
 #ifdef USING_SPLIT_STACK
 	__splitstack_getcontext((void*)(&me->stackcontext[0]));
 #endif
-	getcontext(ucontext_arg(&me->context[0]));
+	__go_getcontext(ucontext_arg(&me->context[0]));
 
 	if(me->entry != nil) {
 		// Got here from mcall.
@@ -574,7 +574,7 @@
 
 	// Save the currently active context.  This will return
 	// multiple times via the setcontext call in mcall.
-	getcontext(ucontext_arg(&gp->context[0]));
+	__go_getcontext(ucontext_arg(&gp->context[0]));
 
 	if(gp->traceback != 0) {
 		// Got here from getTraceback.
@@ -652,7 +652,7 @@
 	gp->gcinitialsp2 = secondary_stack_pointer();
 	gp->gcnextsp2 = (uintptr)(gp->gcinitialsp2);
 #endif
-	getcontext(ucontext_arg(&gp->context[0]));
+	__go_getcontext(ucontext_arg(&gp->context[0]));
 
 	if(gp->entry != nil) {
 		// Got here from mcall.
@@ -672,13 +672,11 @@
 // makeGContext makes a new context for a g.
 void
 makeGContext(G* gp, byte* sp, uintptr spsize) {
-	ucontext_t *uc;
+	__go_context_t *uc;
 
 	uc = ucontext_arg(&gp->context[0]);
-	getcontext(uc);
-	uc->uc_stack.ss_sp = sp;
-	uc->uc_stack.ss_size = (size_t)spsize;
-	makecontext(uc, kickoff, 0);
+	__go_getcontext(uc);
+	__go_makecontext(uc, kickoff, sp, (size_t)spsize);
 }
 
 // The goroutine g is about to enter a system call.
@@ -700,7 +698,7 @@
 	// Save the registers in the g structure so that any pointers
 	// held in registers will be seen by the garbage collector.
 	if (!runtime_usestackmaps)
-		getcontext(ucontext_arg(&g->gcregs[0]));
+		__go_getcontext(ucontext_arg(&g->gcregs[0]));
 
 	// Note that if this function does save any registers itself,
 	// we might store the wrong value in the call to getcontext.
@@ -747,7 +745,7 @@
 	// Save the registers in the g structure so that any pointers
 	// held in registers will be seen by the garbage collector.
 	if (!runtime_usestackmaps)
-		getcontext(ucontext_arg(&g->gcregs[0]));
+		__go_getcontext(ucontext_arg(&g->gcregs[0]));
 
 	// See comment in runtime_entersyscall.
 	doentersyscallblock((uintptr)runtime_getcallerpc(),
diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h
index 71c1a3e..a421dea 100644
--- a/libgo/runtime/runtime.h
+++ b/libgo/runtime/runtime.h
@@ -510,3 +510,26 @@
 // older versions of glibc when a SIGPROF signal arrives while
 // collecting a backtrace.
 extern uint32 __go_runtime_in_callers;
+
+// Cheaper context switch functions.  On Linux/AMD64 (without CET)
+// these are implemented in go-context.S and save/restore only the
+// callee-save registers, SP and PC.  Everywhere else they map onto
+// the libc ucontext functions.
+#if defined(__x86_64__) && defined(__linux__) && !defined(__CET__)
+typedef struct {
+	// Slot order must match the *_OFF offsets in go-context.S:
+	// RBP, RBX, R12, R13, R14, R15, SP, PC.
+	uint64 regs[8];
+} __go_context_t;
+int __go_getcontext(__go_context_t*);
+int __go_setcontext(__go_context_t*);
+void __go_makecontext(__go_context_t*, void (*)(), void*, size_t);
+#else
+#define __go_context_t	ucontext_t
+#define __go_getcontext(c)	getcontext(c)
+#define __go_setcontext(c)	setcontext(c)
+// Arguments are parenthesized so that any expression may be passed.
+#define __go_makecontext(c, fn, sp, size) \
+	((c)->uc_stack.ss_sp = (sp), (c)->uc_stack.ss_size = (size), \
+	 makecontext((c), (fn), 0))
+#endif
diff --git a/libgo/testsuite/Makefile.in b/libgo/testsuite/Makefile.in
index 1307589..4b4d07d 100644
--- a/libgo/testsuite/Makefile.in
+++ b/libgo/testsuite/Makefile.in
@@ -141,6 +141,9 @@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CC_FOR_BUILD = @CC_FOR_BUILD@
 CFLAGS = @CFLAGS@
@@ -273,6 +276,7 @@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@