runtime: do not generate code during runtime in windows NewCallback

Update #5494

R=golang-dev, minux.ma, rsc, iant
CC=golang-dev
https://golang.org/cl/10368043
diff --git a/src/cmd/dist/a.h b/src/cmd/dist/a.h
index 73c1264..d8a13f9 100644
--- a/src/cmd/dist/a.h
+++ b/src/cmd/dist/a.h
@@ -94,6 +94,7 @@
 
 // buildruntime.c
 void	mkzasm(char*, char*);
+void	mkzsys(char*, char*);
 void	mkzgoarch(char*, char*);
 void	mkzgoos(char*, char*);
 void	mkzruntimedefs(char*, char*);
diff --git a/src/cmd/dist/build.c b/src/cmd/dist/build.c
index cdab81d..ba32d3e 100644
--- a/src/cmd/dist/build.c
+++ b/src/cmd/dist/build.c
@@ -528,6 +528,7 @@
 	}},
 	{"pkg/runtime", {
 		"zasm_$GOOS_$GOARCH.h",
+		"zsys_$GOOS_$GOARCH.s",
 		"zgoarch_$GOARCH.go",
 		"zgoos_$GOOS.go",
 		"zruntime_defs_$GOOS_$GOARCH.go",
@@ -552,6 +553,7 @@
 	{"opnames.h", gcopnames},
 	{"enam.c", mkenam},
 	{"zasm_", mkzasm},
+	{"zsys_", mkzsys},
 	{"zgoarch_", mkzgoarch},
 	{"zgoos_", mkzgoos},
 	{"zruntime_defs_", mkzruntimedefs},
diff --git a/src/cmd/dist/buildruntime.c b/src/cmd/dist/buildruntime.c
index 8f3fc541..2d221eb 100644
--- a/src/cmd/dist/buildruntime.c
+++ b/src/cmd/dist/buildruntime.c
@@ -178,6 +178,8 @@
 	},
 };
 
+#define MAXWINCB 2000 /* maximum number of windows callbacks allowed */
+
 // mkzasm writes zasm_$GOOS_$GOARCH.h,
 // which contains struct offsets for use by
 // assembly files.  It also writes a copy to the work space
@@ -249,6 +251,8 @@
 				aggr = "gobuf";
 			else if(streq(fields.p[1], "WinCall"))
 				aggr = "wincall";
+			else if(streq(fields.p[1], "WinCallbackContext"))
+				aggr = "cbctxt";
 			else if(streq(fields.p[1], "SEH"))
 				aggr = "seh";
 		}
@@ -262,6 +266,11 @@
 			bwritestr(&out, bprintf(&b, "#define %s_%s %s\n", aggr, fields.p[n-1], fields.p[n-2]));
 		}
 	}
+
+	// Some #defines that are used for .c files.
+	if(streq(goos, "windows")) {
+		bwritestr(&out, bprintf(&b, "#define cb_max %d\n", MAXWINCB));
+	}
 	
 	// Write both to file and to workdir/zasm_GOOS_GOARCH.h.
 	writefile(&out, file, 0);
@@ -275,6 +284,41 @@
 	vfree(&fields);
 }
 
+// mkzsys writes zsys_$GOOS_$GOARCH.s, which contains arch- or
+// OS-specific assembly. For windows it emits runtime·callbackasm:
+// MAXWINCB consecutive CALLs to runtime·callbackasm1, then a RET.
+void
+mkzsys(char *dir, char *file)
+{
+	int i;
+	Buf out;
+
+	USED(dir);	// only the output file name is needed
+	
+	binit(&out);
+	
+	bwritestr(&out, "// auto generated by go tool dist\n\n");
+	if(streq(goos, "windows")) {
+		bwritef(&out,
+			"// runtime·callbackasm is called by external code to\n"
+			"// execute Go implemented callback function. It is not\n"
+			"// called from the start, instead runtime·compilecallback\n"
+			"// always returns address into runtime·callbackasm offset\n"
+			"// appropriately so different callbacks start with different\n"
+			"// CALL instruction in runtime·callbackasm. This determines\n"
+			"// which Go callback function is executed later on.\n"
+			"TEXT runtime·callbackasm(SB),7,$0\n");
+		for(i=0; i<MAXWINCB; i++) {	// one CALL per possible callback
+			bwritef(&out, "\tCALL\truntime·callbackasm1(SB)\n");
+		}
+		bwritef(&out, "\tRET\n");
+	}
+
+	writefile(&out, file, 0);
+	
+	bfree(&out);
+}
+
 static char *runtimedefs[] = {
 	"proc.c",
 	"iface.c",
diff --git a/src/pkg/runtime/callback_windows.c b/src/pkg/runtime/callback_windows.c
new file mode 100644
index 0000000..88ee53b
--- /dev/null
+++ b/src/pkg/runtime/callback_windows.c
@@ -0,0 +1,76 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "runtime.h"
+#include "type.h"
+#include "typekind.h"
+#include "defs_GOOS_GOARCH.h"
+#include "os_GOOS.h"
+#include "zasm_GOOS_GOARCH.h"
+
+typedef	struct	Callbacks	Callbacks;
+struct	Callbacks {
+	Lock;					// held while ctxt and n are read or updated
+	WinCallbackContext*	ctxt[cb_max];	// registered callback contexts, in registration order
+	int32			n;		// number of entries of ctxt in use
+};
+
+static	Callbacks	cbs;
+
+WinCallbackContext** runtime·cbctxts; // to simplify access to cbs.ctxt in sys_windows_*.s
+
+// Registers Go function fn as a windows callback and returns its entry address inside runtime·callbackasm.
+byte *
+runtime·compilecallback(Eface fn, bool cleanstack)
+{
+	FuncType *ft;
+	Type *t;
+	int32 argsize, i, n;
+	WinCallbackContext *c;
+
+	if(fn.type == nil || fn.type->kind != KindFunc)
+		runtime·panicstring("compilecallback: not a function");
+	ft = (FuncType*)fn.type;
+	if(ft->out.len != 1)
+		runtime·panicstring("compilecallback: function must have one output parameter");
+	if(((Type**)ft->out.array)[0]->size != sizeof(uintptr))
+		runtime·panicstring("compilecallback: output parameter size is wrong");
+	argsize = 0;
+	for(i=0; i<ft->in.len; i++) {
+		t = ((Type**)ft->in.array)[i];
+		if(t->size > sizeof(uintptr))
+			runtime·panicstring("compilecallback: input parameter size is wrong");
+		argsize += sizeof(uintptr);	// each argument is counted as one full word
+	}
+
+	runtime·lock(&cbs);
+	if(runtime·cbctxts == nil)
+		runtime·cbctxts = &(cbs.ctxt[0]);	// publish the table for the assembly code on first use
+	n = cbs.n;
+	for(i=0; i<n; i++) {
+		if(cbs.ctxt[i]->gobody == fn.data) {	// fn is already registered: reuse its entry
+			runtime·unlock(&cbs);
+			// runtime·callbackasm is just a series of CALL instructions
+			// (each is 5 bytes long), and we want the callback to arrive
+			// at the corresponding CALL instruction instead of the start
+			// of runtime·callbackasm.
+			return (byte*)runtime·callbackasm + i * 5;
+		}
+	}
+	if(n >= cb_max)
+		runtime·throw("too many callback functions");
+	c = runtime·mal(sizeof *c);
+	c->gobody = fn.data;
+	c->argsize = argsize;
+	if(cleanstack && argsize!=0)
+		c->restorestack = argsize;	// arguments are removed from the stack on return
+	else
+		c->restorestack = 0;
+	cbs.ctxt[n] = c;
+	cbs.n++;
+	runtime·unlock(&cbs);
+
+	// the entry point is the n-th 5-byte CALL instruction in runtime·callbackasm
+	return (byte*)runtime·callbackasm + n * 5;
+}
diff --git a/src/pkg/runtime/callback_windows_386.c b/src/pkg/runtime/callback_windows_386.c
deleted file mode 100644
index 880588d..0000000
--- a/src/pkg/runtime/callback_windows_386.c
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "runtime.h"
-#include "type.h"
-#include "typekind.h"
-#include "defs_GOOS_GOARCH.h"
-#include "os_GOOS.h"
-
-// Will keep all callbacks in a linked list, so they don't get garbage collected.
-typedef	struct	Callback	Callback;
-struct	Callback {
-	Callback*	link;
-	void*		gobody;
-	byte		asmbody;
-};
-
-typedef	struct	Callbacks	Callbacks;
-struct	Callbacks {
-	Lock;
-	Callback*	link;
-	int32		n;
-};
-
-static	Callbacks	cbs;
-
-// Call back from windows dll into go.
-byte *
-runtime·compilecallback(Eface fn, bool cleanstack)
-{
-	FuncType *ft;
-	Type *t;
-	int32 argsize, i, n;
-	byte *p;
-	Callback *c;
-
-	if(fn.type == nil || fn.type->kind != KindFunc)
-		runtime·panicstring("compilecallback: not a function");
-	ft = (FuncType*)fn.type;
-	if(ft->out.len != 1)
-		runtime·panicstring("compilecallback: function must have one output parameter");
-	if(((Type**)ft->out.array)[0]->size != sizeof(uintptr))
-		runtime·panicstring("compilecallback: output parameter size is wrong");
-	argsize = 0;
-	for(i=0; i<ft->in.len; i++) {
-		t = ((Type**)ft->in.array)[i];
-		if(t->size > sizeof(uintptr))
-			runtime·panicstring("compilecallback: input parameter size is wrong");
-		argsize += sizeof(uintptr);
-	}
-
-	// compute size of new fn.
-	// must match code laid out below.
-	n = 1+4;		// MOVL fn, AX
-	n += 1+4;		// MOVL argsize, DX
-	n += 1+4;		// MOVL callbackasm, CX
-	n += 2;			// CALL CX
-	n += 1;			// RET
-	if(cleanstack && argsize!=0)
-		n += 2;		// ... argsize
-
-	runtime·lock(&cbs);
-	for(c = cbs.link; c != nil; c = c->link) {
-		if(c->gobody == fn.data) {
-			runtime·unlock(&cbs);
-			return &c->asmbody;
-		}
-	}
-	if(cbs.n >= 2000)
-		runtime·throw("too many callback functions");
-	c = runtime·mal(sizeof *c + n);
-	c->gobody = fn.data;
-	c->link = cbs.link;
-	cbs.link = c;
-	cbs.n++;
-	runtime·unlock(&cbs);
-
-	p = &c->asmbody;
-
-	// MOVL fn, AX
-	*p++ = 0xb8;
-	*(uint32*)p = (uint32)(fn.data);
-	p += 4;
-
-	// MOVL argsize, DX
-	*p++ = 0xba;
-	*(uint32*)p = argsize;
-	p += 4;
-
-	// MOVL callbackasm, CX
-	*p++ = 0xb9;
-	*(uint32*)p = (uint32)runtime·callbackasm;
-	p += 4;
-
-	// CALL CX
-	*p++ = 0xff;
-	*p++ = 0xd1;
-
-	// RET argsize?
-	if(cleanstack && argsize!=0) {
-		*p++ = 0xc2;
-		*(uint16*)p = argsize;
-	} else
-		*p = 0xc3;
-
-	return &c->asmbody;
-}
diff --git a/src/pkg/runtime/callback_windows_amd64.c b/src/pkg/runtime/callback_windows_amd64.c
deleted file mode 100644
index 1a47792..0000000
--- a/src/pkg/runtime/callback_windows_amd64.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "runtime.h"
-#include "type.h"
-#include "typekind.h"
-#include "defs_GOOS_GOARCH.h"
-#include "os_GOOS.h"
-
-// Will keep all callbacks in a linked list, so they don't get garbage collected.
-typedef	struct	Callback	Callback;
-struct	Callback {
-	Callback*	link;
-	void*		gobody;
-	byte		asmbody;
-};
-
-typedef	struct	Callbacks	Callbacks;
-struct	Callbacks {
-	Lock;
-	Callback*	link;
-	int32		n;
-};
-
-static	Callbacks	cbs;
-
-// Call back from windows dll into go.
-byte *
-runtime·compilecallback(Eface fn, bool /*cleanstack*/)
-{
-	FuncType *ft;
-	Type *t;
-	int32 argsize, i, n;
-	byte *p;
-	Callback *c;
-
-	if(fn.type == nil || fn.type->kind != KindFunc)
-		runtime·panicstring("compilecallback: not a function");
-	ft = (FuncType*)fn.type;
-	if(ft->out.len != 1)
-		runtime·panicstring("compilecallback: function must have one output parameter");
-	if(((Type**)ft->out.array)[0]->size != sizeof(uintptr))
-		runtime·panicstring("compilecallback: output parameter size is wrong");
-	argsize = 0;
-	for(i=0; i<ft->in.len; i++) {
-		t = ((Type**)ft->in.array)[i];
-		if(t->size > sizeof(uintptr))
-			runtime·panicstring("compilecallback: input parameter size is wrong");
-		argsize += sizeof(uintptr);
-	}
-
-	// compute size of new fn.
-	// must match code laid out below.
-	n  = 2+8+1; // MOVQ fn, AX           / PUSHQ AX
-	n += 2+8+1; // MOVQ argsize, AX      / PUSHQ AX
-	n += 2+8;   // MOVQ callbackasm, AX
-	n += 2;     // JMP  AX
-
-	runtime·lock(&cbs);
-	for(c = cbs.link; c != nil; c = c->link) {
-		if(c->gobody == fn.data) {
-			runtime·unlock(&cbs);
-			return &c->asmbody;
-		}
-	}
-	if(cbs.n >= 2000)
-		runtime·throw("too many callback functions");
-	c = runtime·mal(sizeof *c + n);
-	c->gobody = fn.data;
-	c->link = cbs.link;
-	cbs.link = c;
-	cbs.n++;
-	runtime·unlock(&cbs);
-
-	p = &c->asmbody;
-
-	// MOVQ fn, AX
-	*p++ = 0x48;
-	*p++ = 0xb8;
-	*(uint64*)p = (uint64)(fn.data);
-	p += 8;
-	// PUSH AX
-	*p++ = 0x50;
-
-	// MOVQ argsize, AX
-	*p++ = 0x48;
-	*p++ = 0xb8;
-	*(uint64*)p = argsize;
-	p += 8;
-	// PUSH AX
-	*p++ = 0x50;
-
-	// MOVQ callbackasm, AX
-	*p++ = 0x48;
-	*p++ = 0xb8;
-	*(uint64*)p = (uint64)runtime·callbackasm;
-	p += 8;
-
-	// JMP AX
-	*p++ = 0xFF;
-	*p = 0xE0;
-
-	return &c->asmbody;
-}
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index f62ee81..f5da54a 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -78,6 +78,7 @@
 typedef	struct	Complex128	Complex128;
 typedef	struct	WinCall		WinCall;
 typedef	struct	SEH		SEH;
+typedef	struct	WinCallbackContext	WinCallbackContext;
 typedef	struct	Timers		Timers;
 typedef	struct	Timer		Timer;
 typedef	struct	GCStats		GCStats;
@@ -444,6 +445,13 @@
 	void*	prev;
 	void*	handler;
 };
+// WinCallbackContext describes how to handle a single windows callback.
+struct	WinCallbackContext
+{
+	void*	gobody;		// Go function to call
+	uintptr	argsize;	// callback arguments size (in bytes)
+	uintptr	restorestack;	// number of bytes to adjust the stack by on return (386 only)
+};
 
 #ifdef GOOS_windows
 enum {
diff --git a/src/pkg/runtime/sys_windows_386.s b/src/pkg/runtime/sys_windows_386.s
index 2c63b33..728fb99 100644
--- a/src/pkg/runtime/sys_windows_386.s
+++ b/src/pkg/runtime/sys_windows_386.s
@@ -164,19 +164,16 @@
 	POPL	BP
 	RET
 
-// Called from dynamic function created by ../thread.c compilecallback,
-// running on Windows stack (not Go stack).
-// BX, BP, SI, DI registers and DF flag are preserved
-// as required by windows callback convention.
-// AX = address of go func we need to call
-// DX = total size of arguments
-//
-TEXT runtime·callbackasm+0(SB),7,$0
-	// preserve whatever's at the memory location that
-	// the callback will use to store the return value
-	LEAL	8(SP), CX
-	PUSHL	0(CX)(DX*1)
-	ADDL	$4, DX			// extend argsize by size of return value
+GLOBL runtime·cbctxts(SB), $4
+
+TEXT runtime·callbackasm1+0(SB),7,$0
+  	MOVL	0(SP), AX	// will use to find our callback context
+
+	// remove return address from stack, we are not returning there
+	ADDL	$4, SP
+
+	// load address of callback parameters into CX
+	LEAL	4(SP), CX
 
 	// save registers as required for windows callback
 	PUSHL	DI
@@ -189,19 +186,51 @@
 	PUSHL	0(FS)
 	MOVL	SP, 0(FS)
 
-	// callback parameters
-	PUSHL	DX
-	PUSHL	CX
-	PUSHL	AX
+	// determine index into runtime·cbctxts table
+	SUBL	$runtime·callbackasm(SB), AX
+	MOVL	$0, DX
+	MOVL	$5, BX	// divide by 5 because each CALL instruction in runtime·callbackasm is 5 bytes long
+	DIVL	BX,
 
+	// find corresponding runtime·cbctxts table entry (AX is index+1: the return address is past the CALL, hence -4)
+	MOVL	runtime·cbctxts(SB), BX
+	MOVL	-4(BX)(AX*4), BX
+
+	// extract callback context
+	MOVL	cbctxt_gobody(BX), AX
+	MOVL	cbctxt_argsize(BX), DX
+
+	// preserve whatever's at the memory location that
+	// the callback will use to store the return value
+	PUSHL	0(CX)(DX*1)
+
+	// extend argsize by size of return value
+	ADDL	$4, DX
+
+	// remember how to restore stack on return
+	MOVL	cbctxt_restorestack(BX), BX
+	PUSHL	BX
+
+	// call target Go function
+	PUSHL	DX			// argsize (including return value)
+	PUSHL	CX			// callback parameters
+	PUSHL	AX			// address of target Go function
 	CLD
-
 	CALL	runtime·cgocallback_gofunc(SB)
-
 	POPL	AX
 	POPL	CX
 	POPL	DX
 
+	// how to restore stack on return
+	POPL	BX
+
+	// return value into AX (as per Windows spec)
+	// and restore previously preserved value
+	MOVL	-4(CX)(DX*1), AX
+	POPL	-4(CX)(DX*1)
+
+	MOVL	BX, CX			// cannot use BX anymore
+
 	// pop SEH frame
 	POPL	0(FS)
 	POPL	BX
@@ -212,10 +241,13 @@
 	POPL	SI
 	POPL	DI
 
+	// remove callback parameters before return (as per Windows spec)
+	POPL	DX
+	ADDL	CX, SP
+	PUSHL	DX
+
 	CLD
 
-	MOVL	-4(CX)(DX*1), AX
-	POPL	-4(CX)(DX*1)
 	RET
 
 // void tstart(M *newm);
diff --git a/src/pkg/runtime/sys_windows_amd64.s b/src/pkg/runtime/sys_windows_amd64.s
index b9eaec6..ca07f57 100644
--- a/src/pkg/runtime/sys_windows_amd64.s
+++ b/src/pkg/runtime/sys_windows_amd64.s
@@ -196,32 +196,37 @@
 	POPQ	BP
 	RET
 
-// Continuation of thunk function created for each callback by ../thread.c compilecallback,
-// runs on Windows stack (not Go stack).
-// Thunk code designed to have minimal size for it is copied many (up to thousands) times.
-//
-// thunk:
-//	MOVQ	$fn, AX
-//	PUSHQ	AX
-//	MOVQ	$argsize, AX
-//	PUSHQ	AX
-//	MOVQ	$runtime·callbackasm, AX
-//	JMP	AX
-TEXT runtime·callbackasm(SB),7,$0
+GLOBL runtime·cbctxts(SB), $8
+
+TEXT runtime·callbackasm1(SB),7,$0
 	// Construct args vector for cgocallback().
 	// By windows/amd64 calling convention first 4 args are in CX, DX, R8, R9
 	// args from the 5th on are on the stack.
 	// In any case, even if function has 0,1,2,3,4 args, there is reserved
 	// but uninitialized "shadow space" for the first 4 args.
 	// The values are in registers.
-  	MOVQ	CX, (24+0)(SP)
-  	MOVQ	DX, (24+8)(SP)
-  	MOVQ	R8, (24+16)(SP)
-  	MOVQ	R9, (24+24)(SP)
-	// 6l does not accept writing POPQs here issuing a warning "unbalanced PUSH/POP"
-  	MOVQ	0(SP), DX	// POPQ DX
-  	MOVQ	8(SP), AX	// POPQ AX
-	ADDQ	$16, SP
+  	MOVQ	CX, (16+0)(SP)
+  	MOVQ	DX, (16+8)(SP)
+  	MOVQ	R8, (16+16)(SP)
+  	MOVQ	R9, (16+24)(SP)
+
+	// remove return address from stack, we are not returning there
+  	MOVQ	0(SP), AX
+	ADDQ	$8, SP
+
+	// determine index into runtime·cbctxts table
+	SUBQ	$runtime·callbackasm(SB), AX
+	MOVQ	$0, DX
+	MOVQ	$5, CX	// divide by 5 because each CALL instruction in runtime·callbackasm is 5 bytes long
+	DIVL	CX,
+
+	// find corresponding runtime·cbctxts table entry (AX is index+1: the return address is past the CALL, hence -8)
+	MOVQ	runtime·cbctxts(SB), CX
+	MOVQ	-8(CX)(AX*8), AX
+
+	// extract callback context
+	MOVQ	cbctxt_argsize(AX), DX
+	MOVQ	cbctxt_gobody(AX), AX
 
 	// preserve whatever's at the memory location that
 	// the callback will use to store the return value
@@ -231,8 +236,6 @@
 
 	// DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved
 	// as required by windows callback convention.
-	// 6l does not allow writing many PUSHQs here issuing a warning "nosplit stack overflow"
-	// the warning has no sense as this code uses os thread stack
 	PUSHFQ
 	SUBQ	$64, SP
 	MOVQ	DI, 56(SP)
@@ -247,18 +250,17 @@
 	// prepare call stack.  use SUBQ to hide from stack frame checks
 	// cgocallback(Go func, void *frame, uintptr framesize)
 	SUBQ	$24, SP
-	MOVQ	DX, 16(SP)	// uintptr framesize
-	MOVQ	CX, 8(SP)   // void *frame
-	MOVQ	AX, 0(SP)    // Go func
+	MOVQ	DX, 16(SP)	// argsize (including return value)
+	MOVQ	CX, 8(SP)	// callback parameters
+	MOVQ	AX, 0(SP)	// address of target Go function
 	CLD
-	CALL  runtime·cgocallback_gofunc(SB)
+	CALL	runtime·cgocallback_gofunc(SB)
 	MOVQ	0(SP), AX
 	MOVQ	8(SP), CX
 	MOVQ	16(SP), DX
 	ADDQ	$24, SP
 
 	// restore registers as required for windows callback
-	// 6l does not allow writing many POPs here issuing a warning "nosplit stack overflow"
 	MOVQ	0(SP), R15
 	MOVQ	8(SP), R14
 	MOVQ	16(SP), R13