amd64: use segment memory for thread-local storage
Returns R14 and R15 to the available register pool.
Plays more nicely with ELF ABI C code.
In particular, our signal handlers will no longer crash
when a signal arrives during execution of a cgo C call.

Fixes #720.

R=ken2, r
CC=golang-dev
https://golang.org/cl/1847051
diff --git a/src/cmd/6a/a.y b/src/cmd/6a/a.y
index 804f638..6341ba7 100644
--- a/src/cmd/6a/a.y
+++ b/src/cmd/6a/a.y
@@ -453,6 +453,12 @@
 		$$.type = D_INDIR+D_SP;
 		$$.offset = $1;
 	}
+|	con '(' LSREG ')'
+	{
+		$$ = nullgen;
+		$$.type = D_INDIR+$3;
+		$$.offset = $1;
+	}
 |	con '(' LLREG '*' con ')'
 	{
 		$$ = nullgen;
diff --git a/src/cmd/6c/cgen.c b/src/cmd/6c/cgen.c
index 39452c9..dd8573c 100644
--- a/src/cmd/6c/cgen.c
+++ b/src/cmd/6c/cgen.c
@@ -57,6 +57,12 @@
 	l = n->left;
 	r = n->right;
 	o = n->op;
+	
+	if(n->op == OEXREG || (nn != Z && nn->op == OEXREG)) {
+		gmove(n, nn);
+		return;
+	}
+
 	if(n->addable >= INDEXED) {
 		if(nn == Z) {
 			switch(o) {
diff --git a/src/cmd/6c/peep.c b/src/cmd/6c/peep.c
index 01793bf..13fd25e 100644
--- a/src/cmd/6c/peep.c
+++ b/src/cmd/6c/peep.c
@@ -797,8 +797,6 @@
 		return 3;
 
 	case ACALL:	/* funny */
-		if(REGEXT && v->type <= REGEXT && v->type > exregoffset)
-			return 2;
 		if(REGARG >= 0 && v->type == REGARG)
 			return 2;
 
diff --git a/src/cmd/6c/sgen.c b/src/cmd/6c/sgen.c
index b8247a1..42045f8 100644
--- a/src/cmd/6c/sgen.c
+++ b/src/cmd/6c/sgen.c
@@ -131,6 +131,10 @@
 			n->addable = 11;
 		break;
 
+	case OEXREG:
+		n->addable = 0;
+		break;
+
 	case OREGISTER:
 		n->addable = 12;
 		break;
diff --git a/src/cmd/6c/txt.c b/src/cmd/6c/txt.c
index f96c40f..9a94ca2 100644
--- a/src/cmd/6c/txt.c
+++ b/src/cmd/6c/txt.c
@@ -38,8 +38,6 @@
 
 	thechar = '6';
 	thestring = "amd64";
-	exregoffset = REGEXT;
-	exfregoffset = FREGEXT;
 	listinit();
 	nstring = 0;
 	mnstring = 0;
@@ -491,6 +489,10 @@
 		a->sym = S;
 		break;
 
+	case OEXREG:
+		a->type = D_INDIR + D_GS;
+		a->offset = n->reg - 1;
+		break;
 
 	case OIND:
 		naddr(n->left, a);
@@ -1502,11 +1504,11 @@
 	int32 o;
 
 	if(typechlpv[t->etype]) {
-		if(exregoffset <= REGEXT-4)
+		if(exregoffset >= 64)
 			return 0;
 		o = exregoffset;
-		exregoffset--;
-		return o;
+		exregoffset += 8;
+		return o+1;	// +1 to avoid 0 == failure; naddr's case OEXREG will subtract 1.
 	}
 	return 0;
 }
diff --git a/src/cmd/6l/asm.c b/src/cmd/6l/asm.c
index b45557e..fa419b6 100644
--- a/src/cmd/6l/asm.c
+++ b/src/cmd/6l/asm.c
@@ -821,6 +821,17 @@
 			ph->type = PT_DYNAMIC;
 			ph->flags = PF_R + PF_W;
 			phsh(ph, sh);
+			
+			/*
+			 * Thread-local storage segment (really just size).
+			 */
+			if(tlsoffset != 0) {
+				ph = newElfPhdr();
+				ph->type = PT_TLS;
+				ph->flags = PF_R;
+				ph->memsz = -tlsoffset;
+				ph->align = 8;
+			}
 		}
 
 		ph = newElfPhdr();
diff --git a/src/cmd/6l/l.h b/src/cmd/6l/l.h
index eb796e2..23ca223 100644
--- a/src/cmd/6l/l.h
+++ b/src/cmd/6l/l.h
@@ -340,6 +340,7 @@
 EXTERN	int32	symsize;
 EXTERN	Prog*	textp;
 EXTERN	vlong	textsize;
+EXTERN	int	tlsoffset;
 EXTERN	int	version;
 EXTERN	Prog	zprg;
 EXTERN	int	dtype;
diff --git a/src/cmd/6l/obj.c b/src/cmd/6l/obj.c
index 724f112..3b981a6 100644
--- a/src/cmd/6l/obj.c
+++ b/src/cmd/6l/obj.c
@@ -165,6 +165,11 @@
 			INITRND = 4096;
 		break;
 	case 6:	/* apple MACH */
+		/*
+		 * OS X system constant - offset from 0(GS) to our TLS.
+		 * Explained in ../../libcgo/darwin_amd64.c.
+		 */
+		tlsoffset = 0x8a0;
 		machoinit();
 		HEADR = MACHORESERVE;
 		if(INITRND == -1)
@@ -176,6 +181,13 @@
 		break;
 	case 7:	/* elf64 executable */
 	case 9: /* freebsd */
+		/*
+		 * ELF uses TLS offsets negative from FS.
+		 * Translate 0(FS) and 8(FS) into -16(FS) and -8(FS).
+		 * Also known to ../../pkg/runtime/linux/amd64/sys.s
+		 * and ../../libcgo/linux_amd64.s.
+		 */
+		tlsoffset = -16;
 		elfinit();
 		HEADR = ELFRESERVE;
 		if(INITTEXT == -1)
@@ -434,6 +446,8 @@
 		adrgotype = zsym(pn, f, h);
 	s = a->sym;
 	t = a->type;
+	if(t == D_INDIR+D_GS)
+		a->offset += tlsoffset;
 	if(t != D_AUTO && t != D_PARAM) {
 		if(s && adrgotype)
 			s->gotype = adrgotype;
diff --git a/src/cmd/6l/pass.c b/src/cmd/6l/pass.c
index 8eced50..5fedee2 100644
--- a/src/cmd/6l/pass.c
+++ b/src/cmd/6l/pass.c
@@ -421,6 +421,13 @@
 	s = lookup("exit", 0);
 	vexit = s->value;
 	for(p = firstp; p != P; p = p->link) {
+		if(HEADTYPE == 7 || HEADTYPE == 9) {
+			// ELF uses FS instead of GS.
+			if(p->from.type == D_INDIR+D_GS)
+				p->from.type = D_INDIR+D_FS;
+			if(p->to.type == D_INDIR+D_GS)
+				p->to.type = D_INDIR+D_FS;
+		}
 		if(p->as == ATEXT)
 			curtext = p;
 		if(p->as == ACALL || (p->as == AJMP && p->to.type != D_BRANCH)) {
@@ -663,6 +670,15 @@
 				diag("nosplit func likely to overflow stack");
 
 			if(!(p->from.scale & NOSPLIT)) {
+				p = appendp(p);	// load g into CX
+				p->as = AMOVQ;
+				if(HEADTYPE == 7 || HEADTYPE == 9)	// ELF uses FS
+					p->from.type = D_INDIR+D_FS;
+				else
+					p->from.type = D_INDIR+D_GS;
+				p->from.offset = tlsoffset+0;
+				p->to.type = D_CX;
+				
 				if(debug['K']) {
 					// 6l -K means check not only for stack
 					// overflow but stack underflow.
@@ -672,7 +688,7 @@
 
 					p = appendp(p);
 					p->as = ACMPQ;
-					p->from.type = D_INDIR+D_R15;
+					p->from.type = D_INDIR+D_CX;
 					p->from.offset = 8;
 					p->to.type = D_SP;
 
@@ -694,7 +710,7 @@
 						p = appendp(p);
 						p->as = ACMPQ;
 						p->from.type = D_SP;
-						p->to.type = D_INDIR+D_R15;
+						p->to.type = D_INDIR+D_CX;
 						if(q1) {
 							q1->pcond = p;
 							q1 = P;
@@ -714,7 +730,7 @@
 						p = appendp(p);
 						p->as = ACMPQ;
 						p->from.type = D_AX;
-						p->to.type = D_INDIR+D_R15;
+						p->to.type = D_INDIR+D_CX;
 					}
 
 					// common
@@ -824,7 +840,7 @@
 				// function is marked as nosplit.
 				p = appendp(p);
 				p->as = AMOVQ;
-				p->from.type = D_INDIR+D_R15;
+				p->from.type = D_INDIR+D_CX;
 				p->from.offset = 0;
 				p->to.type = D_BX;
 
diff --git a/src/cmd/6l/span.c b/src/cmd/6l/span.c
index 15f931b..7e0086e 100644
--- a/src/cmd/6l/span.c
+++ b/src/cmd/6l/span.c
@@ -445,6 +445,24 @@
 }
 
 int
+prefixof(Adr *a)	// prefix byte for a segment-relative operand, or 0 if a is not one
+{
+	switch(a->type) {
+	case D_INDIR+D_CS:
+		return 0x2e;	// x86 CS segment-override prefix
+	case D_INDIR+D_DS:
+		return 0x3e;	// x86 DS segment-override prefix
+	case D_INDIR+D_ES:
+		return 0x26;	// x86 ES segment-override prefix
+	case D_INDIR+D_FS:
+		return 0x64;	// x86 FS segment-override prefix
+	case D_INDIR+D_GS:
+		return 0x65;	// x86 GS segment-override prefix
+	}
+	return 0;	// any other operand type: caller emits no prefix byte
+}
+
+int
 oclass(Adr *a)
 {
 	vlong v;
@@ -879,7 +897,7 @@
 	if(t >= D_INDIR) {
 		t -= D_INDIR;
 		rexflag |= (regrex[t] & Rxb) | rex;
-		if(t == D_NONE) {
+		if(t == D_NONE || (D_CS <= t && t <= D_GS)) {
 			if(asmode != 64){
 				*andptr++ = (0 << 6) | (5 << 0) | (r << 3);
 				put4(v);
@@ -1173,7 +1191,7 @@
 	Prog *q, pp;
 	uchar *t;
 	Movtab *mo;
-	int z, op, ft, tt, xo, l;
+	int z, op, ft, tt, xo, l, pre;
 	vlong v;
 
 	o = opindex[p->as];
@@ -1181,6 +1199,13 @@
 		diag("asmins: missing op %P", p);
 		return;
 	}
+	
+	pre = prefixof(&p->from);
+	if(pre)
+		*andptr++ = pre;
+	pre = prefixof(&p->to);
+	if(pre)
+		*andptr++ = pre;
 
 	if(p->ft == 0)
 		p->ft = oclass(&p->from);
@@ -1748,7 +1773,7 @@
 		n = andptr - and;
 		for(np = 0; np < n; np++) {
 			c = and[np];
-			if(c != 0x66 && c != 0xf2 && c != 0xf3 && c != 0x67)
+			if(c != 0xf2 && c != 0xf3 && (c < 0x64 || c > 0x67) && c != 0x2e && c != 0x3e && c != 0x26)
 				break;
 		}
 		memmove(and+np+1, and+np, n-np);
diff --git a/src/cmd/8c/txt.c b/src/cmd/8c/txt.c
index 194599c..4cfd7bc 100644
--- a/src/cmd/8c/txt.c
+++ b/src/cmd/8c/txt.c
@@ -1403,7 +1403,6 @@
 		return o+1;	// +1 to avoid 0 == failure; naddr case OEXREG will -1.
 	}
 
-	USED(t);
 	return 0;
 }
 
diff --git a/src/cmd/8l/obj.c b/src/cmd/8l/obj.c
index f3584bf..9067e94 100644
--- a/src/cmd/8l/obj.c
+++ b/src/cmd/8l/obj.c
@@ -227,7 +227,7 @@
 	case 7:	/* elf32 executable */
 	case 9:
 		/*
-		 * Linux ELF uses TLS offsets negative from %gs.
+		 * ELF uses TLS offsets negative from %gs.
 		 * Translate 0(GS) and 4(GS) into -8(GS) and -4(GS).
 		 * Also known to ../../pkg/runtime/linux/386/sys.s
 		 * and ../../libcgo/linux_386.c.
diff --git a/src/cmd/cc/com.c b/src/cmd/cc/com.c
index 5cbe8b7..b1a8a47 100644
--- a/src/cmd/cc/com.c
+++ b/src/cmd/cc/com.c
@@ -638,10 +638,10 @@
 		n->addable = 1;
 		if(n->class == CEXREG) {
 			n->op = OREGISTER;
-			// on 386, "extern register" generates
+			// on 386 or amd64, "extern register" generates
 			// memory references relative to the
-			// fs segment.
-			if(thechar == '8')	// [sic]
+			// gs or fs segment.
+			if(thechar == '8' || thechar == '6')	// [sic]
 				n->op = OEXREG;
 			n->reg = n->sym->offset;
 			n->xoffset = 0;