// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// _rt0_amd64 is common startup code for most amd64 systems when using
// internal linking. This is the entry point for the program from the
// kernel for an ordinary -buildmode=exe program. The stack holds the
// number of arguments and the C-style argv.
//
// It loads argc and argv into DI and SI, the registers rt0_go reads
// them from, and jumps to rt0_go.
TEXT _rt0_amd64(SB),NOSPLIT,$-8
	MOVQ	0(SP), DI	// argc
	LEAQ	8(SP), SI	// argv
	JMP	runtime·rt0_go(SB)

// main is common startup code for most amd64 systems when using
// external linking. The C startup code will call the symbol "main"
// passing argc and argv in the usual C ABI registers DI and SI.
// Those are the registers rt0_go expects, so no shuffling is needed.
TEXT main(SB),NOSPLIT,$-8
	JMP	runtime·rt0_go(SB)

// rt0_go is the common bootstrap, entered from _rt0_amd64 and main with
// argc in DI and argv in SI. It records g0's stack bounds on the current
// (operating system) stack, probes CPU features with CPUID, calls
// _cgo_init if one is present, sets up TLS, links g0 and m0, and then
// calls args, osinit, schedinit, newproc (for runtime·main) and mstart.
// If mstart returns, the code deliberately faults.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP		// clear the low 4 bits: SP is now 16-byte aligned
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVL	$0, AX
	CPUID
	MOVL	AX, SI		// keep leaf-0 EAX; compared against 7 at eax7 below
	CMPL	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·isIntel(SB)
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVL	$1, AX
	CPUID
	MOVL	AX, runtime·processorVersionInfo(SB)

	TESTL	$(1<<26), DX // SSE2
	SETNE	runtime·support_sse2(SB)

	TESTL	$(1<<9), CX // SSSE3
	SETNE	runtime·support_ssse3(SB)

	TESTL	$(1<<19), CX // SSE4.1
	SETNE	runtime·support_sse41(SB)

	TESTL	$(1<<20), CX // SSE4.2
	SETNE	runtime·support_sse42(SB)

	TESTL	$(1<<23), CX // POPCNT
	SETNE	runtime·support_popcnt(SB)

	TESTL	$(1<<25), CX // AES
	SETNE	runtime·support_aes(SB)

	TESTL	$(1<<27), CX // OSXSAVE
	SETNE	runtime·support_osxsave(SB)

	// If OS support for XMM and YMM is not present
	// support_avx will be set back to false later.
	TESTL	$(1<<28), CX // AVX
	SETNE	runtime·support_avx(SB)

eax7:
	// Load EAX=7/ECX=0 cpuid flags
	CMPL	SI, $7		// SI still holds EAX from the leaf-0 query
	JLT	osavx
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID

	TESTL	$(1<<3), BX // BMI1
	SETNE	runtime·support_bmi1(SB)

	// If OS support for XMM and YMM is not present
	// support_avx2 will be set back to false later.
	TESTL	$(1<<5), BX
	SETNE	runtime·support_avx2(SB)

	TESTL	$(1<<8), BX // BMI2
	SETNE	runtime·support_bmi2(SB)

	TESTL	$(1<<9), BX // ERMS
	SETNE	runtime·support_erms(SB)

osavx:
	CMPB	runtime·support_osxsave(SB), $1
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
	JE	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
	MOVB	$0, runtime·support_avx2(SB)

nocpuinfo:
	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP	ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP	ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP	ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ	2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash
	RET

// mainPC is an 8-byte read-only word holding the address of runtime·main.
// rt0_go passes the address of this word to newproc as the entry argument.
DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

// breakpoint emits the single byte 0xcc (the x86 INT3 opcode) and returns.
TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

// asminit is a no-op on amd64.
TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
//
// Stores the caller's SP, PC and BP and the current g into the Gobuf,
// and zeroes its ret field. Calls badctxt if the Gobuf's ctxt field is
// non-zero.
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	BP, gobuf_bp(AX)
	// Assert ctxt is zero. See func save.
	MOVQ	gobuf_ctxt(AX), BX
	TESTQ	BX, BX
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
//
// Installs buf.g as the current g, loads SP, AX (ret), DX (ctxt) and BP
// from the Gobuf, clears those Gobuf fields, and jumps to buf.pc.
// The 16-byte frame holds the two arguments passed to
// writebarrierptr_prewrite when ctxt is non-nil.
TEXT runtime·gogo(SB), NOSPLIT, $16-8
	MOVQ	buf+0(FP), BX		// gobuf

	// If ctxt is not nil, invoke deletion barrier before overwriting.
	MOVQ	gobuf_ctxt(BX), AX
	TESTQ	AX, AX
	JZ	nilctxt
	LEAQ	gobuf_ctxt(BX), AX
	MOVQ	AX, 0(SP)
	MOVQ	$0, 8(SP)
	CALL	runtime·writebarrierptr_prewrite(SB)
	MOVQ	buf+0(FP), BX		// reload gobuf after the call

nilctxt:
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
//
// If the current g is already m->g0, control goes to badmcall.
// If fn does return, control goes to badmcall2.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX		// argument to fn: the g we switched away from
	MOVQ	DI, DX		// DX = fn (closure context register)
	MOVQ	0(DI), DI	// DI = code pointer stored at fn
	CALL	DI
	POPQ	AX
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
//
// Calls fn on the g0 stack. If the current g is already gsignal or g0,
// fn is called directly on the current stack. If the current g is
// m->curg, the state is saved in g->sched, the stack is switched to g0,
// fn is called, and then the stack is switched back to m->curg.
// Any other g is reported through badsystemstack.
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
//
// DX is pushed as the ctxt argument to newstack, so it must hold the
// value to pass on entry (morestack_noctxt zeroes it first).
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackg0(SB)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackgsignal(SB)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)
	// newstack will fill gobuf.ctxt.

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	PUSHQ	DX	// ctxt argument
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	POPQ	DX	// keep balance check happy
	RET

// morestack but not preserving ctxt.
// Zeroes DX (the register morestack pushes as newstack's ctxt argument)
// and jumps to morestack.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

// DISPATCH jumps to NAME when CX (unsigned) is at most MAXSIZE;
// otherwise it falls through to whatever follows the macro.
#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.

// reflect·call forwards to runtime's reflectcall.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

// reflectcall loads argsize (zero-extended) into CX and jumps to the
// first call* routine whose fixed frame size is at least argsize.
// If argsize exceeds the largest size listed, it jumps to badreflectcall.
TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN defines a function NAME with a MAXSIZE-byte frame that copies
// argsize bytes from argptr to its own stack, calls f, and then hands
// callRet the registers it needs to copy the results (the bytes from
// retoffset up to argsize) back to argptr.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX	retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET

// callRet copies return values back at the end of call*. This is a
// separate function so it can allocate stack space for the arguments
// to reflectcallmove. It does not follow the Go ABI; it expects its
// arguments in registers.
//
// On entry (as set up by CALLFN): DX = argtype, DI = destination,
// SI = source, CX = byte count. They are stored, in that order, as the
// four stack arguments to reflectcallmove.
TEXT callRet<>(SB), NOSPLIT, $32-0
	NO_LOCAL_POINTERS
	MOVQ	DX, 0(SP)
	MOVQ	DI, 8(SP)
	MOVQ	SI, 16(SP)
	MOVQ	CX, 24(SP)
	CALL	runtime·reflectcallmove(SB)
	RET

// One fixed-frame call* routine per size that reflectcall dispatches to.
CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// procyield executes PAUSE in a loop, decrementing the 32-bit cycles
// argument until it reaches zero.
// NOTE(review): the count is decremented before it is tested, so
// cycles == 0 would wrap and spin 2^32 times; callers presumably always
// pass a positive count - confirm.
TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET


// publicationBarrier emits no instructions besides RET.
TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
//
// The 5-byte adjustment rewinds the return address so that the CALL
// instruction preceding it is executed again after the deferred
// function returns.
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
	SUBQ	$5, (SP)	// return to CALL again
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
// Records the caller's PC, SP and BP in the current g's sched, zeroes
// sched.ret, and calls badctxt if sched.ctxt is non-zero.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9		// caller's PC (our return address)
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9		// caller's SP
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	// Assert ctxt is zero. See func save.
	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
	TESTQ	R9, R9
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
//
// arg is passed in both DI and CX so the same code serves the AMD64
// and Win64 conventions. The low 32 bits of AX after the call are
// stored as the result.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	CMPQ	R8, $0
	JEQ	nosave
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	// Switch to system stack.
	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

nosave:
	// Running on a system stack, perhaps even without a g.
	// Having no g can happen during thread creation or thread teardown
	// (see needm/dropm on Solaris, for example).
	// This code is like the above sequence but without saving/restoring g
	// and without worrying about the stack moving out from under us
	// (because we're on a system stack, not a goroutine stack).
	// The above code could be used directly if already on a system stack,
	// but then the only path through this code would be a rare case on Solaris.
	// Using this code for all "already on system stack" calls exercises it more,
	// which should help keep it correct.
	SUBQ	$64, SP
	ANDQ	$~15, SP
	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
	MOVQ	DX, 40(SP)	// save original stack pointer
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX
	MOVQ	40(SP), SI	// restore original stack pointer
	MOVQ	SI, SP
	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
//
// The 32-byte frame holds the four outgoing arguments: the address of
// the fn argument slot, then frame, framesize and ctxt copied through.
TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	ctxt+24(FP), AX
	MOVQ	AX, 24(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

Ian Lance Taylor5f9a8702016-04-27 14:18:29 -0700691// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
Alex Brainman9d968cb2015-04-27 17:32:23 +1000692// See cgocall.go for more details.
//
// Overview of the sequence below:
//   1. Find the current m; if g is nil, obtain one by calling needm.
//   2. Save m->g0->sched.sp in 0(SP) and set it to the current SP.
//   3. Make m->curg the current g, build a frame on its stack and call
//      cgocallbackg there.
//   4. Restore m->curg->sched, switch back to m->g0 and restore its sched.sp.
//   5. If R8 is zero (the needm path), call dropm.
// R8 carries the "old m" value for the whole function and is spilled to
// 8(SP) of the new frame across the cgocallbackg call.
Ian Lance Taylor5f9a8702016-04-27 14:18:29 -0700693TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
Russ Coxe844f532014-09-12 07:46:11 -0400694 NO_LOCAL_POINTERS
695
Russ Cox89f185f2014-06-26 11:54:39 -0400696 // If g is nil, Go did not create the current thread.
697 // Call needm to obtain one m for temporary use.
Russ Cox6c976392013-02-20 17:48:23 -0500698 // In this case, we're running on the thread stack, so there's
699 // lots of space, but the linker doesn't know. Hide the call from
700 // the linker analysis by using an indirect call through AX.
701 get_tls(CX)
702#ifdef GOOS_windows
	// If the TLS base in CX is zero, JEQ 2(PC) skips the load of g below,
	// so BX stays 0 and the needm path is taken.
Austin Clements20a6ff72015-01-27 18:29:02 -0500703 MOVL $0, BX
Russ Cox6c976392013-02-20 17:48:23 -0500704 CMPQ CX, $0
Russ Coxcefdb9c2013-07-23 22:59:32 -0400705 JEQ 2(PC)
Russ Cox6c976392013-02-20 17:48:23 -0500706#endif
Austin Clements20a6ff72015-01-27 18:29:02 -0500707 MOVQ g(CX), BX
708 CMPQ BX, $0
Russ Cox89f185f2014-06-26 11:54:39 -0400709 JEQ needm
Austin Clements20a6ff72015-01-27 18:29:02 -0500710 MOVQ g_m(BX), BX
711 MOVQ BX, R8 // holds oldm until end of function
Russ Cox89f185f2014-06-26 11:54:39 -0400712 JMP havem
Russ Cox6c976392013-02-20 17:48:23 -0500713needm:
	// The word at 0(SP) is zeroed before the call and read back into R8
	// afterwards; a zero R8 is what selects the dropm call at the end.
Russ Cox89f185f2014-06-26 11:54:39 -0400714 MOVQ $0, 0(SP)
Russ Cox6c976392013-02-20 17:48:23 -0500715 MOVQ $runtime·needm(SB), AX
716 CALL AX
Russ Coxf0112822013-07-24 09:01:57 -0400717 MOVQ 0(SP), R8
	// Reload g and m from TLS after the needm call.
Russ Coxe473f422010-08-04 17:50:22 -0700718 get_tls(CX)
Austin Clements20a6ff72015-01-27 18:29:02 -0500719 MOVQ g(CX), BX
720 MOVQ g_m(BX), BX
Russ Coxc4efaac2014-10-28 21:53:09 -0400721
722 // Set m->sched.sp = SP, so that if a panic happens
723 // during the function we are about to execute, it will
724 // have a valid SP to run on the g0 stack.
725 // The next few lines (after the havem label)
726 // will save this SP onto the stack and then write
727 // the same SP back to m->sched.sp. That seems redundant,
728 // but if an unrecovered panic happens, unwindm will
729 // restore the g->sched.sp from the stack location
Russ Cox656be312014-11-12 14:54:31 -0500730 // and then systemstack will try to use it. If we don't set it here,
Russ Coxc4efaac2014-10-28 21:53:09 -0400731 // that restored SP will be uninitialized (typically 0) and
732 // will not be usable.
Austin Clements20a6ff72015-01-27 18:29:02 -0500733 MOVQ m_g0(BX), SI
Russ Coxc4efaac2014-10-28 21:53:09 -0400734 MOVQ SP, (g_sched+gobuf_sp)(SI)
Russ Cox9b732382012-03-08 12:12:40 -0500735
Russ Cox6c976392013-02-20 17:48:23 -0500736havem:
737 // Now there's a valid m, and we're running on its m->g0.
738 // Save current m->g0->sched.sp on stack and then set it to SP.
739 // Save current sp in m->g0->sched.sp in preparation for
740 // switch back to m->curg stack.
Russ Coxdba623b2013-07-23 18:40:02 -0400741 // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
Austin Clements20a6ff72015-01-27 18:29:02 -0500742 MOVQ m_g0(BX), SI
Russ Coxdba623b2013-07-23 18:40:02 -0400743 MOVQ (g_sched+gobuf_sp)(SI), AX
744 MOVQ AX, 0(SP)
Russ Coxf9ca3b52011-03-07 10:37:42 -0500745 MOVQ SP, (g_sched+gobuf_sp)(SI)
Ian Lance Taylor2d0ff3f2010-04-09 13:30:11 -0700746
Russ Coxdba623b2013-07-23 18:40:02 -0400747 // Switch to m->curg stack and call runtime.cgocallbackg.
748 // Because we are taking over the execution of m->curg
749 // but *not* resuming what had been running, we need to
750 // save that information (m->curg->sched) so we can restore it.
Russ Cox528534c2013-06-05 07:16:53 -0400751 // We can restore m->curg->sched.sp easily, because calling
Alex Brainman72e83482011-08-18 12:17:09 -0400752 // runtime.cgocallbackg leaves SP unchanged upon return.
Russ Cox528534c2013-06-05 07:16:53 -0400753 // To save m->curg->sched.pc, we push it onto the stack.
Russ Coxf9ca3b52011-03-07 10:37:42 -0500754 // This has the added benefit that it looks to the traceback
Alex Brainman72e83482011-08-18 12:17:09 -0400755 // routine like cgocallbackg is going to return to that
Russ Coxdba623b2013-07-23 18:40:02 -0400756 // PC (because the frame we allocate below has the same
757 // size as cgocallback_gofunc's frame declared above)
Russ Coxf9ca3b52011-03-07 10:37:42 -0500758 // so that the traceback will seamlessly trace back into
759 // the earlier calls.
Russ Coxdba623b2013-07-23 18:40:02 -0400760 //
Ian Lance Taylor5f9a8702016-04-27 14:18:29 -0700761 // In the new goroutine, 8(SP) holds the saved R8.
Austin Clements20a6ff72015-01-27 18:29:02 -0500762 MOVQ m_curg(BX), SI
Russ Coxf9ca3b52011-03-07 10:37:42 -0500763 MOVQ SI, g(CX)
764 MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI
Austin Clements20a6ff72015-01-27 18:29:02 -0500765 MOVQ (g_sched+gobuf_pc)(SI), BX
766 MOVQ BX, -8(DI)
Austin Clements3c0fee12015-01-14 11:09:50 -0500767 // Compute the size of the frame, including return PC and, if
Keith Randall1ea60c12016-12-02 15:17:52 -0800768 // GOEXPERIMENT=framepointer, the saved base pointer
	// ctxt is read into BX before SP is replaced; it becomes the word at
	// 0(SP) of the new frame. AX = &fv - SP is the frame size that is
	// subtracted from m->curg's saved sp to form the new SP.
Ian Lance Taylor5f9a8702016-04-27 14:18:29 -0700769 MOVQ ctxt+24(FP), BX
Rob Pikec21f1d52015-02-19 13:44:06 -0800770 LEAQ fv+0(FP), AX
Austin Clements3c0fee12015-01-14 11:09:50 -0500771 SUBQ SP, AX
772 SUBQ AX, DI
773 MOVQ DI, SP
774
	// New frame layout: 0(SP) = ctxt, 8(SP) = saved R8.
Ian Lance Taylor5f9a8702016-04-27 14:18:29 -0700775 MOVQ R8, 8(SP)
776 MOVQ BX, 0(SP)
Russ Coxf9ca3b52011-03-07 10:37:42 -0500777 CALL runtime·cgocallbackg(SB)
Ian Lance Taylor5f9a8702016-04-27 14:18:29 -0700778 MOVQ 8(SP), R8
Russ Coxf9ca3b52011-03-07 10:37:42 -0500779
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +0000780 // Compute the size of the frame again. FP and SP have
Austin Clements3c0fee12015-01-14 11:09:50 -0500781 // completely different values here than they did above,
782 // but only their difference matters.
Rob Pikec21f1d52015-02-19 13:44:06 -0800783 LEAQ fv+0(FP), AX
Austin Clements3c0fee12015-01-14 11:09:50 -0500784 SUBQ SP, AX
785
Russ Cox528534c2013-06-05 07:16:53 -0400786 // Restore g->sched (== m->curg->sched) from saved values.
	// DI = SP + frame size is the sp saved before the switch; the pc that
	// was stored just below it at -8(DI) is read back.
Russ Coxe473f422010-08-04 17:50:22 -0700787 get_tls(CX)
Russ Coxf9ca3b52011-03-07 10:37:42 -0500788 MOVQ g(CX), SI
Austin Clements3c0fee12015-01-14 11:09:50 -0500789 MOVQ SP, DI
790 ADDQ AX, DI
791 MOVQ -8(DI), BX
Austin Clements20a6ff72015-01-27 18:29:02 -0500792 MOVQ BX, (g_sched+gobuf_pc)(SI)
Russ Coxf9ca3b52011-03-07 10:37:42 -0500793 MOVQ DI, (g_sched+gobuf_sp)(SI)
794
795 // Switch back to m->g0's stack and restore m->g0->sched.sp.
796 // (Unlike m->curg, the g0 goroutine never uses sched.pc,
797 // so we do not have to restore it.)
Austin Clements20a6ff72015-01-27 18:29:02 -0500798 MOVQ g(CX), BX
799 MOVQ g_m(BX), BX
800 MOVQ m_g0(BX), SI
Russ Coxf9ca3b52011-03-07 10:37:42 -0500801 MOVQ SI, g(CX)
802 MOVQ (g_sched+gobuf_sp)(SI), SP
Russ Coxdba623b2013-07-23 18:40:02 -0400803 MOVQ 0(SP), AX
804 MOVQ AX, (g_sched+gobuf_sp)(SI)
Russ Cox6c976392013-02-20 17:48:23 -0500805
806 // If the m on entry was nil, we called needm above to borrow an m
807 // for the duration of the call. Since the call is over, return it with dropm.
	// JNE 3(PC) jumps over the two-instruction indirect call to dropm.
Russ Coxf0112822013-07-24 09:01:57 -0400808 CMPQ R8, $0
Russ Cox6c976392013-02-20 17:48:23 -0500809 JNE 3(PC)
810 MOVQ $runtime·dropm(SB), AX
811 CALL AX
Russ Coxf9ca3b52011-03-07 10:37:42 -0500812
813 // Done!
Ian Lance Taylor2d0ff3f2010-04-09 13:30:11 -0700814 RET
815
Russ Cox89f185f2014-06-26 11:54:39 -0400816// void setg(G*); set g. for use by needm.
//
// Stores the gg argument into the TLS g slot. On Windows the word at
// 0x28(GS) is updated first: if gg is nil it is set to zero and the function
// returns immediately without touching the g slot; otherwise it is set to
// the address of gg->m->tls.
Russ Cox25f6b022014-08-27 11:32:17 -0400817TEXT runtime·setg(SB), NOSPLIT, $0-8
Russ Cox89f185f2014-06-26 11:54:39 -0400818 MOVQ gg+0(FP), BX
Russ Cox6c976392013-02-20 17:48:23 -0500819#ifdef GOOS_windows
Russ Cox89f185f2014-06-26 11:54:39 -0400820 CMPQ BX, $0
Russ Cox6c976392013-02-20 17:48:23 -0500821 JNE settls
822 MOVQ $0, 0x28(GS)
823 RET
824settls:
Russ Cox89f185f2014-06-26 11:54:39 -0400825 MOVQ g_m(BX), AX
Russ Cox6c976392013-02-20 17:48:23 -0500826 LEAQ m_tls(AX), AX // AX = &gg->m->tls
827 MOVQ AX, 0x28(GS)
828#endif
829 get_tls(CX)
Russ Cox6c976392013-02-20 17:48:23 -0500830 MOVQ BX, g(CX)
831 RET
832
Russ Cox89f185f2014-06-26 11:54:39 -0400833// void setg_gcc(G*); set g called from gcc.
//
// The G pointer is taken from register DI rather than from a Go stack
// argument, and is stored into the TLS g slot. AX is used as scratch for
// the TLS base.
834TEXT setg_gcc<>(SB),NOSPLIT,$0
Russ Cox6a70f9d2013-03-25 18:14:02 -0400835 get_tls(AX)
Russ Cox89f185f2014-06-26 11:54:39 -0400836 MOVQ DI, g(AX)
Russ Cox6a70f9d2013-03-25 18:14:02 -0400837 RET
838
Russ Cox15b76ad2014-09-09 13:39:57 -0400839// check that SP is in range [g->stack.lo, g->stack.hi)
//
// Executes INT $3 if either bound check fails. As coded, both comparisons
// are strict (JHI): the function falls through to RET only when
// g->stack.lo < SP < g->stack.hi, so SP == g->stack.lo also traps.
Keith Randall5a546962013-08-07 10:23:24 -0700840TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
Russ Coxe473f422010-08-04 17:50:22 -0700841 get_tls(CX)
842 MOVQ g(CX), AX
	// Upper bound: skip the trap when stack.hi > SP.
Russ Cox15b76ad2014-09-09 13:39:57 -0400843 CMPQ (g_stack+stack_hi)(AX), SP
Russ Cox01eaf782010-03-30 10:53:16 -0700844 JHI 2(PC)
845 INT $3
	// Lower bound: skip the trap when SP > stack.lo.
Russ Cox15b76ad2014-09-09 13:39:57 -0400846 CMPQ SP, (g_stack+stack_lo)(AX)
Russ Cox01eaf782010-03-30 10:53:16 -0700847 JHI 2(PC)
848 INT $3
849 RET
850
Dmitry Vyukov6e70fdd2015-02-17 14:25:49 +0300851// func cputicks() int64
//
// Returns the value read by RDTSC as a single 64-bit integer. A fence is
// issued before RDTSC: LFENCE when runtime·lfenceBeforeRdtsc is 1, MFENCE
// otherwise.
// NOTE(review): the TEXT line declares 0 bytes of arguments/results although
// an 8-byte result is written to ret+0(FP) — confirm whether $0-8 is intended.
Keith Randall5a546962013-08-07 10:23:24 -0700852TEXT runtime·cputicks(SB),NOSPLIT,$0-0
Dmitry Vyukov6e70fdd2015-02-17 14:25:49 +0300853 CMPB runtime·lfenceBeforeRdtsc(SB), $1
854 JNE mfence
Ilya Tocar1d1f2fb2016-01-13 16:43:22 +0300855 LFENCE
Dmitry Vyukov6e70fdd2015-02-17 14:25:49 +0300856 JMP done
857mfence:
Ilya Tocar1d1f2fb2016-01-13 16:43:22 +0300858 MFENCE
	// "done" marks the end of the fence selection, not the end of the function.
Dmitry Vyukov6e70fdd2015-02-17 14:25:49 +0300859done:
Damian Gryski8e765da2012-02-02 14:09:27 -0500860 RDTSC
	// Combine the two halves: result = (DX << 32) + AX.
861 SHLQ $32, DX
862 ADDQ DX, AX
Russ Cox25f6b022014-08-27 11:32:17 -0400863 MOVQ AX, ret+0(FP)
Damian Gryski8e765da2012-02-02 14:09:27 -0500864 RET
865
Keith Randalla5d40242013-03-12 10:47:44 -0700866// hash function using AES hardware instructions
//
// Entry point for hashing a (pointer, size) pair. It only sets up the
// register interface of aeshashbody (AX = data, CX = length, DX = address of
// the result slot) and tail-jumps to it. The seed argument is not loaded
// here; aeshashbody reads it itself as h+8(FP), which still refers to this
// function's arguments because the transfer is a JMP rather than a CALL.
Keith Randalla2a97682014-07-31 15:07:05 -0700867TEXT runtime·aeshash(SB),NOSPLIT,$0-32
868 MOVQ p+0(FP), AX // ptr to data
Keith Randalld5e4c402015-01-06 16:42:48 -0800869 MOVQ s+16(FP), CX // size
870 LEAQ ret+24(FP), DX
Keith Randalla5d40242013-03-12 10:47:44 -0700871 JMP runtime·aeshashbody(SB)
872
// aeshashstr hashes the string whose header is pointed to by p. The data
// pointer is read from offset 0 of the header and the length from offset 8;
// they are passed to aeshashbody in AX and CX together with the address of
// the result slot in DX. As in aeshash, the seed is read by aeshashbody
// from h+8(FP).
Keith Randalld5e4c402015-01-06 16:42:48 -0800873TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
Keith Randalla2a97682014-07-31 15:07:05 -0700874 MOVQ p+0(FP), AX // ptr to string struct
Keith Randalla5d40242013-03-12 10:47:44 -0700875 MOVQ 8(AX), CX // length of string
876 MOVQ (AX), AX // string data
Keith Randalld5e4c402015-01-06 16:42:48 -0800877 LEAQ ret+16(FP), DX
Keith Randalla5d40242013-03-12 10:47:44 -0700878 JMP runtime·aeshashbody(SB)
879
880// AX: data
881// CX: length
Keith Randalld5e4c402015-01-06 16:42:48 -0800882// DX: address to put return value
//
// Shared body of aeshash and aeshashstr, entered by JMP so that h+8(FP)
// names the seed argument of the original caller. The seed is combined with
// the length and with runtime·aeskeysched, then the code dispatches on the
// length: 0, 1-15, 16, 17-32, 33-64, 65-128 and 129+ bytes each have their
// own path. Every path ends by storing the low 64 bits of its final SSE
// register to (DX).
883TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
Keith Randall91059de2015-08-31 16:26:12 -0700884 // Fill an SSE register with our seeds.
885 MOVQ h+8(FP), X0 // 64 bits of per-table hash seed
886 PINSRW $4, CX, X0 // 16 bits of length
887 PSHUFHW $0, X0, X0 // repeat length 4 times total
888 MOVO X0, X1 // save unscrambled seed
889 PXOR runtime·aeskeysched(SB), X0 // xor in per-process seed
890 AESENC X0, X0 // scramble seed
891
	// Dispatch on length (unsigned compares).
Keith Randallee669722013-05-15 09:40:14 -0700892 CMPQ CX, $16
Keith Randall7a4a64e2014-12-10 14:20:17 -0800893 JB aes0to15
894 JE aes16
895 CMPQ CX, $32
896 JBE aes17to32
897 CMPQ CX, $64
898 JBE aes33to64
899 CMPQ CX, $128
900 JBE aes65to128
901 JMP aes129plus
Keith Randalla5d40242013-03-12 10:47:44 -0700902
Keith Randall7a4a64e2014-12-10 14:20:17 -0800903aes0to15:
904 TESTQ CX, CX
905 JE aes0
906
	// AX is advanced by 16; if bits 4-11 of the sum are all zero the
	// 16-byte load is not attempted directly and endofpage is used.
907 ADDQ $16, AX
908 TESTW $0xff0, AX
909 JE endofpage
Keith Randalla5d40242013-03-12 10:47:44 -0700910
Keith Randallee669722013-05-15 09:40:14 -0700911 // 16 bytes loaded at this address won't cross
912 // a page boundary, so we can load it directly.
	// CX is doubled so that (AX)(CX*8) addresses the 16-byte masks<> entry
	// for this length, which clears the bytes beyond the data.
Keith Randall91059de2015-08-31 16:26:12 -0700913 MOVOU -16(AX), X1
Keith Randalla5d40242013-03-12 10:47:44 -0700914 ADDQ CX, CX
Austin Clements20a6ff72015-01-27 18:29:02 -0500915 MOVQ $masks<>(SB), AX
Keith Randall91059de2015-08-31 16:26:12 -0700916 PAND (AX)(CX*8), X1
917final1:
Keith Randallc83e6f52016-05-26 08:56:49 -0700918 PXOR X0, X1 // xor data with seed
919 AESENC X1, X1 // scramble combo 3 times
920 AESENC X1, X1
Keith Randall91059de2015-08-31 16:26:12 -0700921 AESENC X1, X1
922 MOVQ X1, (DX)
Keith Randall7a4a64e2014-12-10 14:20:17 -0800923 RET
924
925endofpage:
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +0000926 // address ends in 1111xxxx. Might be up against
Keith Randalla5d40242013-03-12 10:47:44 -0700927 // a page boundary, so load ending at last byte.
928 // Then shift bytes down using pshufb.
	// AX was already advanced by 16, so -32(AX)(CX*1) is the 16-byte window
	// ending at the last data byte; the shifts<> entry for this length
	// moves the data to the low bytes.
Keith Randall91059de2015-08-31 16:26:12 -0700929 MOVOU -32(AX)(CX*1), X1
Keith Randalla5d40242013-03-12 10:47:44 -0700930 ADDQ CX, CX
Austin Clements20a6ff72015-01-27 18:29:02 -0500931 MOVQ $shifts<>(SB), AX
Keith Randall91059de2015-08-31 16:26:12 -0700932 PSHUFB (AX)(CX*8), X1
933 JMP final1
Keith Randalla5d40242013-03-12 10:47:44 -0700934
Keith Randall7a4a64e2014-12-10 14:20:17 -0800935aes0:
Keith Randall731bdc52015-09-01 12:53:15 -0700936 // Return scrambled input seed
Keith Randall91059de2015-08-31 16:26:12 -0700937 AESENC X0, X0
938 MOVQ X0, (DX)
Keith Randall7a4a64e2014-12-10 14:20:17 -0800939 RET
940
941aes16:
Keith Randall91059de2015-08-31 16:26:12 -0700942 MOVOU (AX), X1
943 JMP final1
Keith Randall7a4a64e2014-12-10 14:20:17 -0800944
945aes17to32:
Keith Randall91059de2015-08-31 16:26:12 -0700946 // make second starting seed
947 PXOR runtime·aeskeysched+16(SB), X1
948 AESENC X1, X1
949
Keith Randall7a4a64e2014-12-10 14:20:17 -0800950 // load data to be hashed
	// (first 16 bytes and last 16 bytes; the two windows overlap when the
	// length is below 32)
Keith Randall91059de2015-08-31 16:26:12 -0700951 MOVOU (AX), X2
952 MOVOU -16(AX)(CX*1), X3
Keith Randall7a4a64e2014-12-10 14:20:17 -0800953
Keith Randallc83e6f52016-05-26 08:56:49 -0700954 // xor with seed
955 PXOR X0, X2
956 PXOR X1, X3
957
Keith Randall7a4a64e2014-12-10 14:20:17 -0800958 // scramble 3 times
Keith Randallc83e6f52016-05-26 08:56:49 -0700959 AESENC X2, X2
960 AESENC X3, X3
Keith Randall91059de2015-08-31 16:26:12 -0700961 AESENC X2, X2
962 AESENC X3, X3
963 AESENC X2, X2
964 AESENC X3, X3
Keith Randall7a4a64e2014-12-10 14:20:17 -0800965
966 // combine results
Keith Randall91059de2015-08-31 16:26:12 -0700967 PXOR X3, X2
968 MOVQ X2, (DX)
Keith Randall7a4a64e2014-12-10 14:20:17 -0800969 RET
970
971aes33to64:
Keith Randall91059de2015-08-31 16:26:12 -0700972 // make 3 more starting seeds
973 MOVO X1, X2
974 MOVO X1, X3
975 PXOR runtime·aeskeysched+16(SB), X1
976 PXOR runtime·aeskeysched+32(SB), X2
977 PXOR runtime·aeskeysched+48(SB), X3
978 AESENC X1, X1
979 AESENC X2, X2
980 AESENC X3, X3
Keith Randall7a4a64e2014-12-10 14:20:17 -0800981
	// load the first 32 and the last 32 bytes
Keith Randall91059de2015-08-31 16:26:12 -0700982 MOVOU (AX), X4
983 MOVOU 16(AX), X5
984 MOVOU -32(AX)(CX*1), X6
985 MOVOU -16(AX)(CX*1), X7
Keith Randallc83e6f52016-05-26 08:56:49 -0700986
987 PXOR X0, X4
988 PXOR X1, X5
989 PXOR X2, X6
990 PXOR X3, X7
Keith Randall91059de2015-08-31 16:26:12 -0700991
Keith Randallc83e6f52016-05-26 08:56:49 -0700992 AESENC X4, X4
993 AESENC X5, X5
994 AESENC X6, X6
995 AESENC X7, X7
Keith Randall91059de2015-08-31 16:26:12 -0700996
997 AESENC X4, X4
998 AESENC X5, X5
999 AESENC X6, X6
1000 AESENC X7, X7
1001
1002 AESENC X4, X4
1003 AESENC X5, X5
1004 AESENC X6, X6
1005 AESENC X7, X7
Keith Randall7a4a64e2014-12-10 14:20:17 -08001006
Keith Randall91059de2015-08-31 16:26:12 -07001007 PXOR X6, X4
1008 PXOR X7, X5
1009 PXOR X5, X4
1010 MOVQ X4, (DX)
Keith Randall7a4a64e2014-12-10 14:20:17 -08001011 RET
1012
1013aes65to128:
Keith Randall91059de2015-08-31 16:26:12 -07001014 // make 7 more starting seeds
1015 MOVO X1, X2
1016 MOVO X1, X3
1017 MOVO X1, X4
1018 MOVO X1, X5
1019 MOVO X1, X6
1020 MOVO X1, X7
1021 PXOR runtime·aeskeysched+16(SB), X1
1022 PXOR runtime·aeskeysched+32(SB), X2
1023 PXOR runtime·aeskeysched+48(SB), X3
1024 PXOR runtime·aeskeysched+64(SB), X4
1025 PXOR runtime·aeskeysched+80(SB), X5
1026 PXOR runtime·aeskeysched+96(SB), X6
1027 PXOR runtime·aeskeysched+112(SB), X7
1028 AESENC X1, X1
1029 AESENC X2, X2
1030 AESENC X3, X3
1031 AESENC X4, X4
1032 AESENC X5, X5
1033 AESENC X6, X6
1034 AESENC X7, X7
Keith Randall7a4a64e2014-12-10 14:20:17 -08001035
Keith Randall91059de2015-08-31 16:26:12 -07001036 // load data
	// (first 64 bytes and last 64 bytes)
1037 MOVOU (AX), X8
1038 MOVOU 16(AX), X9
1039 MOVOU 32(AX), X10
1040 MOVOU 48(AX), X11
1041 MOVOU -64(AX)(CX*1), X12
1042 MOVOU -48(AX)(CX*1), X13
1043 MOVOU -32(AX)(CX*1), X14
1044 MOVOU -16(AX)(CX*1), X15
1045
Keith Randallc83e6f52016-05-26 08:56:49 -07001046 // xor with seed
1047 PXOR X0, X8
1048 PXOR X1, X9
1049 PXOR X2, X10
1050 PXOR X3, X11
1051 PXOR X4, X12
1052 PXOR X5, X13
1053 PXOR X6, X14
1054 PXOR X7, X15
Keith Randall91059de2015-08-31 16:26:12 -07001055
Keith Randallc83e6f52016-05-26 08:56:49 -07001056 // scramble 3 times
Keith Randall91059de2015-08-31 16:26:12 -07001057 AESENC X8, X8
1058 AESENC X9, X9
1059 AESENC X10, X10
1060 AESENC X11, X11
1061 AESENC X12, X12
1062 AESENC X13, X13
1063 AESENC X14, X14
1064 AESENC X15, X15
Keith Randallc83e6f52016-05-26 08:56:49 -07001065
1066 AESENC X8, X8
1067 AESENC X9, X9
1068 AESENC X10, X10
1069 AESENC X11, X11
1070 AESENC X12, X12
1071 AESENC X13, X13
1072 AESENC X14, X14
1073 AESENC X15, X15
1074
Keith Randall91059de2015-08-31 16:26:12 -07001075 AESENC X8, X8
1076 AESENC X9, X9
1077 AESENC X10, X10
1078 AESENC X11, X11
1079 AESENC X12, X12
1080 AESENC X13, X13
1081 AESENC X14, X14
1082 AESENC X15, X15
1083
1084 // combine results
1085 PXOR X12, X8
1086 PXOR X13, X9
1087 PXOR X14, X10
1088 PXOR X15, X11
1089 PXOR X10, X8
1090 PXOR X11, X9
1091 PXOR X9, X8
1092 MOVQ X8, (DX)
Keith Randall7a4a64e2014-12-10 14:20:17 -08001093 RET
1094
1095aes129plus:
Keith Randall91059de2015-08-31 16:26:12 -07001096 // make 7 more starting seeds
1097 MOVO X1, X2
1098 MOVO X1, X3
1099 MOVO X1, X4
1100 MOVO X1, X5
1101 MOVO X1, X6
1102 MOVO X1, X7
1103 PXOR runtime·aeskeysched+16(SB), X1
1104 PXOR runtime·aeskeysched+32(SB), X2
1105 PXOR runtime·aeskeysched+48(SB), X3
1106 PXOR runtime·aeskeysched+64(SB), X4
1107 PXOR runtime·aeskeysched+80(SB), X5
1108 PXOR runtime·aeskeysched+96(SB), X6
1109 PXOR runtime·aeskeysched+112(SB), X7
1110 AESENC X1, X1
1111 AESENC X2, X2
1112 AESENC X3, X3
1113 AESENC X4, X4
1114 AESENC X5, X5
1115 AESENC X6, X6
1116 AESENC X7, X7
1117
Keith Randall7a4a64e2014-12-10 14:20:17 -08001118 // start with last (possibly overlapping) block
Keith Randall91059de2015-08-31 16:26:12 -07001119 MOVOU -128(AX)(CX*1), X8
1120 MOVOU -112(AX)(CX*1), X9
1121 MOVOU -96(AX)(CX*1), X10
1122 MOVOU -80(AX)(CX*1), X11
1123 MOVOU -64(AX)(CX*1), X12
1124 MOVOU -48(AX)(CX*1), X13
1125 MOVOU -32(AX)(CX*1), X14
1126 MOVOU -16(AX)(CX*1), X15
Keith Randall7a4a64e2014-12-10 14:20:17 -08001127
Keith Randallc83e6f52016-05-26 08:56:49 -07001128 // xor in seed
1129 PXOR X0, X8
1130 PXOR X1, X9
1131 PXOR X2, X10
1132 PXOR X3, X11
1133 PXOR X4, X12
1134 PXOR X5, X13
1135 PXOR X6, X14
1136 PXOR X7, X15
Keith Randall91059de2015-08-31 16:26:12 -07001137
Keith Randall7a4a64e2014-12-10 14:20:17 -08001138 // compute number of remaining 128-byte blocks
	// CX = (length-1)/128; length is at least 129 on this path, so the
	// loop below runs at least once.
1139 DECQ CX
1140 SHRQ $7, CX
1141
1142aesloop:
Keith Randallc83e6f52016-05-26 08:56:49 -07001143 // scramble state
1144 AESENC X8, X8
1145 AESENC X9, X9
1146 AESENC X10, X10
1147 AESENC X11, X11
1148 AESENC X12, X12
1149 AESENC X13, X13
1150 AESENC X14, X14
1151 AESENC X15, X15
1152
Keith Randall7a4a64e2014-12-10 14:20:17 -08001153 // scramble state, xor in a block
	// X0-X7 are reused here as temporaries for the data block; the seeds
	// they held were already folded into X8-X15 above.
Keith Randall91059de2015-08-31 16:26:12 -07001154 MOVOU (AX), X0
1155 MOVOU 16(AX), X1
1156 MOVOU 32(AX), X2
1157 MOVOU 48(AX), X3
1158 AESENC X0, X8
1159 AESENC X1, X9
1160 AESENC X2, X10
1161 AESENC X3, X11
1162 MOVOU 64(AX), X4
1163 MOVOU 80(AX), X5
1164 MOVOU 96(AX), X6
1165 MOVOU 112(AX), X7
1166 AESENC X4, X12
1167 AESENC X5, X13
1168 AESENC X6, X14
1169 AESENC X7, X15
Keith Randall7a4a64e2014-12-10 14:20:17 -08001170
Keith Randallc83e6f52016-05-26 08:56:49 -07001171 ADDQ $128, AX
1172 DECQ CX
1173 JNE aesloop
1174
1175 // 3 more scrambles to finish
Keith Randall91059de2015-08-31 16:26:12 -07001176 AESENC X8, X8
1177 AESENC X9, X9
1178 AESENC X10, X10
1179 AESENC X11, X11
1180 AESENC X12, X12
1181 AESENC X13, X13
1182 AESENC X14, X14
1183 AESENC X15, X15
Keith Randall91059de2015-08-31 16:26:12 -07001184 AESENC X8, X8
1185 AESENC X9, X9
1186 AESENC X10, X10
1187 AESENC X11, X11
1188 AESENC X12, X12
1189 AESENC X13, X13
1190 AESENC X14, X14
1191 AESENC X15, X15
1192 AESENC X8, X8
1193 AESENC X9, X9
1194 AESENC X10, X10
1195 AESENC X11, X11
1196 AESENC X12, X12
1197 AESENC X13, X13
1198 AESENC X14, X14
1199 AESENC X15, X15
Keith Randall7a4a64e2014-12-10 14:20:17 -08001200
	// fold the eight lanes into X8
Keith Randall91059de2015-08-31 16:26:12 -07001201 PXOR X12, X8
1202 PXOR X13, X9
1203 PXOR X14, X10
1204 PXOR X15, X11
1205 PXOR X10, X8
1206 PXOR X11, X9
1207 PXOR X9, X8
1208 MOVQ X8, (DX)
Keith Randall7a4a64e2014-12-10 14:20:17 -08001209 RET
1210
// aeshash32 hashes the 4 bytes at p. The seed h occupies the low 64 bits of
// X0 and the data is inserted as 32-bit element 2; the register is then put
// through three AESENC rounds keyed with runtime·aeskeysched at offsets 0,
// 16 and 32, and the low 64 bits are returned.
Keith Randalld5e4c402015-01-06 16:42:48 -08001211TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
Keith Randalla2a97682014-07-31 15:07:05 -07001212 MOVQ p+0(FP), AX // ptr to data
Keith Randalld5e4c402015-01-06 16:42:48 -08001213 MOVQ h+8(FP), X0 // seed
Keith Randalla5d40242013-03-12 10:47:44 -07001214 PINSRD $2, (AX), X0 // data
Keith Randalldb53d972013-03-20 14:34:26 -07001215 AESENC runtime·aeskeysched+0(SB), X0
1216 AESENC runtime·aeskeysched+16(SB), X0
Keith Randall7a4a64e2014-12-10 14:20:17 -08001217 AESENC runtime·aeskeysched+32(SB), X0
Keith Randalld5e4c402015-01-06 16:42:48 -08001218 MOVQ X0, ret+16(FP)
Keith Randalla5d40242013-03-12 10:47:44 -07001219 RET
1220
// aeshash64 hashes the 8 bytes at p. The seed h occupies the low 64 bits of
// X0 and the data is inserted as 64-bit element 1 (the high half); the
// register is then put through three AESENC rounds keyed with
// runtime·aeskeysched at offsets 0, 16 and 32, and the low 64 bits are
// returned.
Keith Randalld5e4c402015-01-06 16:42:48 -08001221TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
Keith Randalla2a97682014-07-31 15:07:05 -07001222 MOVQ p+0(FP), AX // ptr to data
Keith Randalld5e4c402015-01-06 16:42:48 -08001223 MOVQ h+8(FP), X0 // seed
Keith Randalla5d40242013-03-12 10:47:44 -07001224 PINSRQ $1, (AX), X0 // data
Keith Randalldb53d972013-03-20 14:34:26 -07001225 AESENC runtime·aeskeysched+0(SB), X0
1226 AESENC runtime·aeskeysched+16(SB), X0
Keith Randall7a4a64e2014-12-10 14:20:17 -08001227 AESENC runtime·aeskeysched+32(SB), X0
Keith Randalld5e4c402015-01-06 16:42:48 -08001228 MOVQ X0, ret+16(FP)
Keith Randalla5d40242013-03-12 10:47:44 -07001229 RET
1230
1231// simple mask to get rid of data in the high part of the register.
//
// Layout: 16 entries of 16 bytes each (two little-endian 8-byte words per
// entry). Entry i, at offset 16*i, has its low i bytes set to 0xff and the
// remaining bytes zero, for i = 0..15. aeshashbody doubles the length and
// scales it by 8 to index this table, i.e. it uses the entry at 16*length.
Russ Cox9ddfb642013-07-16 16:24:09 -04001232DATA masks<>+0x00(SB)/8, $0x0000000000000000
1233DATA masks<>+0x08(SB)/8, $0x0000000000000000
1234DATA masks<>+0x10(SB)/8, $0x00000000000000ff
1235DATA masks<>+0x18(SB)/8, $0x0000000000000000
1236DATA masks<>+0x20(SB)/8, $0x000000000000ffff
1237DATA masks<>+0x28(SB)/8, $0x0000000000000000
1238DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
1239DATA masks<>+0x38(SB)/8, $0x0000000000000000
1240DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
1241DATA masks<>+0x48(SB)/8, $0x0000000000000000
1242DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
1243DATA masks<>+0x58(SB)/8, $0x0000000000000000
1244DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
1245DATA masks<>+0x68(SB)/8, $0x0000000000000000
1246DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
1247DATA masks<>+0x78(SB)/8, $0x0000000000000000
1248DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
1249DATA masks<>+0x88(SB)/8, $0x0000000000000000
1250DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
1251DATA masks<>+0x98(SB)/8, $0x00000000000000ff
1252DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
1253DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
1254DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
1255DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
1256DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
1257DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
1258DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
1259DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
1260DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
1261DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
1262DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
1263DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
Keith Randall5a546962013-08-07 10:23:24 -07001264GLOBL masks<>(SB),RODATA,$256
Keith Randalla5d40242013-03-12 10:47:44 -07001265
// checkASM reports whether the masks<> and shifts<> tables are both placed
// at addresses whose low four bits are zero. The two addresses are OR-ed
// together so that a single TESTQ $15 covers both.
// NOTE(review): presumably required because aeshashbody uses these tables
// as memory operands of PAND/PSHUFB — confirm.
Shenghou Ma3583a442015-09-03 02:44:26 -04001266TEXT ·checkASM(SB),NOSPLIT,$0-1
1267 // check that masks<>(SB) and shifts<>(SB) are aligned to 16 bytes
1268 MOVQ $masks<>(SB), AX
1269 MOVQ $shifts<>(SB), BX
1270 ORQ BX, AX
1271 TESTQ $15, AX
1272 SETEQ ret+0(FP)
1273 RET
1274
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +00001275// these are arguments to pshufb. They move data down from
Russ Cox9ddfb642013-07-16 16:24:09 -04001276// the high bytes of the register to the low bytes of the register.
1277// index is how many bytes to move.
//
// Layout: 16 entries of 16 bytes each (two little-endian 8-byte words per
// entry). In entry i, at offset 16*i, selector bytes 0..i-1 hold the source
// indexes 16-i..15 and every other selector byte is 0xff; entry 0 is all
// zero. aeshashbody indexes the table at 16*length, only for lengths 1..15.
1278DATA shifts<>+0x00(SB)/8, $0x0000000000000000
1279DATA shifts<>+0x08(SB)/8, $0x0000000000000000
1280DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
1281DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
1282DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
1283DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
1284DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
1285DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
1286DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
1287DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
1288DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
1289DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
1290DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
1291DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
1292DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
1293DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
1294DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
1295DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
1296DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
1297DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
1298DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
1299DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
1300DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
1301DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
1302DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
1303DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
1304DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
1305DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
1306DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
1307DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
1308DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
1309DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
Keith Randall5a546962013-08-07 10:23:24 -07001310GLOBL shifts<>(SB),RODATA,$256
Keith Randall3d5daa22013-04-02 16:26:15 -07001311
Keith Randallbd70bd92016-02-22 13:20:38 -08001312// memequal(p, q unsafe.Pointer, size uintptr) bool
//
// Identical pointers are reported equal immediately, without reading size
// or memory. Otherwise the arguments are placed in the registers expected
// by memeqbody (SI, DI = pointers, BX = size, AX = address of the result
// byte) and control is transferred with a tail jump; memeqbody writes the
// result and returns to our caller.
1313TEXT runtime·memequal(SB),NOSPLIT,$0-25
Keith Randall0c6b55e2014-07-16 14:16:19 -07001314 MOVQ a+0(FP), SI
1315 MOVQ b+8(FP), DI
Keith Randallbd70bd92016-02-22 13:20:38 -08001316 CMPQ SI, DI
1317 JEQ eq
Keith Randall0c6b55e2014-07-16 14:16:19 -07001318 MOVQ size+16(FP), BX
Keith Randallc526f3a2015-04-21 14:22:41 -07001319 LEAQ ret+24(FP), AX
1320 JMP runtime·memeqbody(SB)
Keith Randallbd70bd92016-02-22 13:20:38 -08001321eq:
1322 MOVB $1, ret+24(FP)
1323 RET
Keith Randall0c6b55e2014-07-16 14:16:19 -07001324
Keith Randalld5e4c402015-01-06 16:42:48 -08001325// memequal_varlen(a, b unsafe.Pointer) bool
//
// Same as memequal, except that the size is not an argument: it is read
// from offset 8 of the object addressed by DX. Identical pointers are
// reported equal without reading the size.
1326TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
1327 MOVQ a+0(FP), SI
1328 MOVQ b+8(FP), DI
1329 CMPQ SI, DI
1330 JEQ eq
1331 MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure
Keith Randallc526f3a2015-04-21 14:22:41 -07001332 LEAQ ret+16(FP), AX
1333 JMP runtime·memeqbody(SB)
Keith Randalld5e4c402015-01-06 16:42:48 -08001334eq:
1335 MOVB $1, ret+16(FP)
1336 RET
1337
Keith Randall3d5daa22013-04-02 16:26:15 -07001338// a in SI
1339// b in DI
1340// count in BX
Keith Randallc526f3a2015-04-21 14:22:41 -07001341// address of result byte in AX
//
// Writes 1 to (AX) if the two count-byte regions are equal and 0 otherwise.
// Strategy by size: fewer than 8 bytes -> "small"; 8..63 bytes -> 8 bytes at
// a time ("bigloop"); 64 bytes or more -> 64 bytes at a time, using the AVX2
// loop when runtime·support_avx2 is 1 and the SSE loop otherwise, then
// finishing in bigloop. SI, DI, BX, CX and DX are overwritten.
Keith Randall5a546962013-08-07 10:23:24 -07001342TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
Keith Randall3d5daa22013-04-02 16:26:15 -07001343 CMPQ BX, $8
1344 JB small
Ilya Tocar967564b2015-10-29 17:17:05 +03001345 CMPQ BX, $64
1346 JB bigloop
1347 CMPB runtime·support_avx2(SB), $1
1348 JE hugeloop_avx2
Keith Randall3d5daa22013-04-02 16:26:15 -07001349
1350 // 64 bytes at a time using xmm registers
1351hugeloop:
1352 CMPQ BX, $64
1353 JB bigloop
1354 MOVOU (SI), X0
1355 MOVOU (DI), X1
1356 MOVOU 16(SI), X2
1357 MOVOU 16(DI), X3
1358 MOVOU 32(SI), X4
1359 MOVOU 32(DI), X5
1360 MOVOU 48(SI), X6
1361 MOVOU 48(DI), X7
1362 PCMPEQB X1, X0
1363 PCMPEQB X3, X2
1364 PCMPEQB X5, X4
1365 PCMPEQB X7, X6
	// AND the four per-byte equality results together; the 16-bit mask in
	// DX is 0xffff only if all 64 bytes matched.
1366 PAND X2, X0
1367 PAND X6, X4
1368 PAND X4, X0
1369 PMOVMSKB X0, DX
1370 ADDQ $64, SI
1371 ADDQ $64, DI
1372 SUBQ $64, BX
1373 CMPL DX, $0xffff
1374 JEQ hugeloop
Keith Randallc526f3a2015-04-21 14:22:41 -07001375 MOVB $0, (AX)
Keith Randall3d5daa22013-04-02 16:26:15 -07001376 RET
1377
Ilya Tocar967564b2015-10-29 17:17:05 +03001378 // 64 bytes at a time using ymm registers
1379hugeloop_avx2:
1380 CMPQ BX, $64
1381 JB bigloop_avx2
Russ Cox8d881b82016-01-22 22:25:15 -05001382 VMOVDQU (SI), Y0
1383 VMOVDQU (DI), Y1
1384 VMOVDQU 32(SI), Y2
1385 VMOVDQU 32(DI), Y3
1386 VPCMPEQB Y1, Y0, Y4
1387 VPCMPEQB Y2, Y3, Y5
1388 VPAND Y4, Y5, Y6
1389 VPMOVMSKB Y6, DX
Ilya Tocar967564b2015-10-29 17:17:05 +03001390 ADDQ $64, SI
1391 ADDQ $64, DI
1392 SUBQ $64, BX
	// 32-bit mask: all ones only if all 64 bytes matched.
1393 CMPL DX, $0xffffffff
1394 JEQ hugeloop_avx2
	// VZEROUPPER is executed on both exits from the AVX2 loop.
1395 VZEROUPPER
1396 MOVB $0, (AX)
1397 RET
1398
1399bigloop_avx2:
1400 VZEROUPPER
1401
Keith Randall3d5daa22013-04-02 16:26:15 -07001402 // 8 bytes at a time using 64-bit register
1403bigloop:
1404 CMPQ BX, $8
1405 JBE leftover
1406 MOVQ (SI), CX
1407 MOVQ (DI), DX
1408 ADDQ $8, SI
1409 ADDQ $8, DI
1410 SUBQ $8, BX
1411 CMPQ CX, DX
1412 JEQ bigloop
Keith Randallc526f3a2015-04-21 14:22:41 -07001413 MOVB $0, (AX)
Keith Randall3d5daa22013-04-02 16:26:15 -07001414 RET
1415
1416 // remaining 0-8 bytes
	// Compare the 8 bytes that end at the end of each region. This window
	// may reach back into bytes already compared; every path into bigloop
	// starts from a total size of at least 8.
1417leftover:
1418 MOVQ -8(SI)(BX*1), CX
1419 MOVQ -8(DI)(BX*1), DX
1420 CMPQ CX, DX
Keith Randallc526f3a2015-04-21 14:22:41 -07001421 SETEQ (AX)
Keith Randall3d5daa22013-04-02 16:26:15 -07001422 RET
1423
1424small:
1425 CMPQ BX, $0
1426 JEQ equal
1427
	// CX = -(8*count); used below as a shift count (only CL is consumed by
	// the shift instructions).
1428 LEAQ 0(BX*8), CX
1429 NEGQ CX
1430
	// Only the low byte of the address is examined.
1431 CMPB SI, $0xf8
1432 JA si_high
1433
1434 // load at SI won't cross a page boundary.
1435 MOVQ (SI), SI
1436 JMP si_finish
1437si_high:
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +00001438 // address ends in 11111xxx. Load up to bytes we want, move to correct position.
Keith Randall3d5daa22013-04-02 16:26:15 -07001439 MOVQ -8(SI)(BX*1), SI
1440 SHRQ CX, SI
1441si_finish:
1442
1443 // same for DI.
1444 CMPB DI, $0xf8
1445 JA di_high
1446 MOVQ (DI), DI
1447 JMP di_finish
1448di_high:
1449 MOVQ -8(DI)(BX*1), DI
1450 SHRQ CX, DI
1451di_finish:
1452
	// The wanted bytes now sit in the low positions of SI and DI. Take the
	// difference and shift the unwanted high bytes out; the SETEQ below
	// then stores 1 exactly when the shifted difference is zero.
1453 SUBQ SI, DI
1454 SHLQ CX, DI
	// Also reached directly from the count == 0 test above, whose compare
	// leaves the "equal" condition set, so a zero-length compare stores 1.
1455equal:
Keith Randallc526f3a2015-04-21 14:22:41 -07001456 SETEQ (AX)
Keith Randall3d5daa22013-04-02 16:26:15 -07001457 RET
Keith Randallb3946dc2013-05-14 16:05:51 -07001458
// cmpstring compares two strings. It only loads the base pointers and
// lengths of both strings, plus the address of the result word, into the
// registers cmpbody expects (SI/BX = first base/length, DI/DX = second
// base/length, R9 = result address) and tail-jumps to cmpbody, which stores
// the result.
Keith Randall5a546962013-08-07 10:23:24 -07001459TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
Russ Cox25f6b022014-08-27 11:32:17 -04001460 MOVQ s1_base+0(FP), SI
1461 MOVQ s1_len+8(FP), BX
1462 MOVQ s2_base+16(FP), DI
1463 MOVQ s2_len+24(FP), DX
Keith Randallc526f3a2015-04-21 14:22:41 -07001464 LEAQ ret+32(FP), R9
1465 JMP runtime·cmpbody(SB)
Keith Randallb3946dc2013-05-14 16:05:51 -07001466
// bytes·Compare is the byte-slice counterpart of cmpstring. Each slice
// argument occupies 24 bytes of the frame; only the words at offsets 0 and 8
// (first slice) and 24 and 32 (second slice) are read, and the words at
// offsets 16 and 40 are not used. The result word is at offset 48. Control
// is transferred to cmpbody with a tail jump.
Russ Cox7a524a12014-12-22 13:27:53 -05001467TEXT bytes·Compare(SB),NOSPLIT,$0-56
Keith Randallb3946dc2013-05-14 16:05:51 -07001468 MOVQ s1+0(FP), SI
1469 MOVQ s1+8(FP), BX
1470 MOVQ s2+24(FP), DI
1471 MOVQ s2+32(FP), DX
Keith Randallc526f3a2015-04-21 14:22:41 -07001472 LEAQ res+48(FP), R9
1473 JMP runtime·cmpbody(SB)
Keith Randallb3946dc2013-05-14 16:05:51 -07001474
// runtime·cmpbody is the shared three-way byte comparison used by
// runtime·cmpstring and bytes·Compare. It is entered by JMP, not CALL,
// and returns straight to the wrapper's caller.
//
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
//
// Registers written: AX, BX, CX, SI, DI, R8, X0, X1, and on the AVX2
// path Y0 and Y2-Y6.
//
// Strategy: compare the first min(alen, blen) bytes. The first byte
// that differs decides the result; if none differs, the lengths decide.
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame			// same pointer: only the lengths matter
	CMPQ	BX, DX
	MOVQ	DX, R8			// MOVQ leaves the flags from CMPQ intact
	CMOVQLT	BX, R8			// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	runtime·support_avx2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// Compare 16 bytes per iteration while more than 16 remain.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX		// convert EQ to NE
	JNE	diff16			// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diffNN: a difference was found in the 16-byte lane that ends NN
	// bytes past SI/DI. Advance both pointers to the start of that lane
	// and fall into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX			// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX			// unsigned byte compare: 1 if a's byte > b's byte
	LEAQ	-1(AX*2), AX		// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Compare the 8 bytes that END at offset R8. This may re-read
	// bytes already found equal, which is harmless.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX			// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX			// index of highest bit difference
	SHRQ	CX, AX			// move a's bit to bottom
	ANDQ	$1, AX			// mask bit
	LEAQ	-1(AX*2), AX		// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX		// bytes left -> bits left
	NEGQ	CX			// - bits left (== 64 - bits left mod 64)
	JEQ	allsame			// nothing to compare: lengths decide

	// Load bytes of a into the high bytes of SI.
	// If the low address byte is above 0xf8 the address ends in
	// 11111xxx and an 8-byte load at SI could cross a page boundary;
	// in that case load the 8 bytes that end at a+R8 instead and
	// shift the wanted bytes down first.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI			// wanted bytes at the top, zeros below

	// Load bytes of b into the high bytes of DI (same scheme as for a).
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI			// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI			// find bit differences
	JEQ	allsame
	BSRQ	DI, CX			// index of highest bit difference
	SHRQ	CX, SI			// move a's bit to bottom
	ANDQ	$1, SI			// mask bit
	LEAQ	-1(SI*2), AX		// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// All compared bytes are equal; order by length.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX			// 1 if alen > blen
	SETEQ	CX			// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
	// SSE variant: four 16-byte compares per iteration.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	// Leave for the 16-byte loop only when fewer than 64 bytes remain.
	// Exactly 64 is a full block, matching both the entry test above
	// and the AVX2 loop below.
	JB	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB	Y2, Y3, Y0
	VPMOVMSKB	Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB	Y4, Y5, Y6
	VPMOVMSKB	Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
	// AX holds a 32-bit difference mask relative to SI/DI, so diff16's
	// BSFQ finds the right byte without adjusting the pointers.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
	// diff48 advances SI/DI by 32, making the mask relative again.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop
1680
// strings·indexShortStr is the stack-argument entry point for substring
// search in a string. It loads the registers runtime·indexShortStr
// expects and tail-jumps to it; the result is written through R11 into
// the ret slot at 32(FP).
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	// Haystack pointer, plus a copy in R10 that the body uses to turn
	// the match address back into an index.
	MOVQ	s+0(FP), DI
	MOVQ	DI, R10
	// Needle pointer.
	MOVQ	c+16(FP), BP
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ	s_len+8(FP), DX
	MOVQ	c_len+24(FP), AX
	LEAQ	ret+32(FP), R11
	JMP	runtime·indexShortStr(SB)
1690
// bytes·indexShortStr is the stack-argument entry point for substring
// search in a byte slice. Same register contract as the strings
// variant; only the frame offsets differ (needle at 24/32, result at
// 48(FP)).
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
	// Haystack pointer, plus a copy in R10 for the final index
	// computation in the body.
	MOVQ	s+0(FP), DI
	MOVQ	DI, R10
	// Needle pointer.
	MOVQ	c+24(FP), BP
	// Lengths: haystack in DX, needle in AX.
	MOVQ	s_len+8(FP), DX
	MOVQ	c_len+32(FP), AX
	LEAQ	ret+48(FP), R11
	JMP	runtime·indexShortStr(SB)
1699
// runtime·indexShortStr finds the first occurrence of a short needle in
// a haystack and stores its index, or -1, through R11. It is entered by
// JMP from the strings/bytes wrappers.
//
// Inputs:
//   AX:  length of string, that we are searching for
//   DX:  length of string, in which we are searching
//   DI:  pointer to string, in which we are searching
//   BP:  pointer to string, that we are searching for
//   R10: copy of the initial DI (subtracted from the match address)
//   R11: address, where to put return value
//
// Registers written: BX, CX, DX, SI, DI, BP, R9, X0-X3 and, on the
// AVX2 paths, Y0-Y4.
//
// One code path exists per needle-length class. In each, DX is turned
// into an end address for the scan and DI walks the haystack one byte
// at a time. Lengths that are not a power of two compare the first and
// the last N bytes of the needle; the two windows overlap.
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
	CMPQ	AX, DX
	JA	fail			// needle longer than haystack
	CMPQ	DX, $16
	JAE	sse42			// haystack >= 16 bytes: consider PCMPESTRI
no_sse42:
	// --- needle length <= 2: one 16-bit compare ---
	CMPQ	AX, $2
	JA	_3_or_more
	MOVW	(BP), BP
	LEAQ	-1(DI)(DX*1), DX
loop2:
	MOVW	(DI), SI
	CMPW	SI, BP
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop2
	JMP	fail
_3_or_more:
	// --- needle length 3: bytes 0-1 in BP, bytes 1-2 in BX ---
	CMPQ	AX, $3
	JA	_4_or_more
	MOVW	1(BP), BX
	MOVW	(BP), BP
	LEAQ	-2(DI)(DX*1), DX
loop3:
	MOVW	(DI), SI
	CMPW	SI, BP
	JZ	partial_success3
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop3
	JMP	fail
partial_success3:
	MOVW	1(DI), SI
	CMPW	SI, BX
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop3
	JMP	fail
_4_or_more:
	// --- needle length 4: one 32-bit compare ---
	CMPQ	AX, $4
	JA	_5_or_more
	MOVL	(BP), BP
	LEAQ	-3(DI)(DX*1), DX
loop4:
	MOVL	(DI), SI
	CMPL	SI, BP
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop4
	JMP	fail
_5_or_more:
	// --- needle length 5..7: first 4 bytes in BP, last 4 in BX ---
	CMPQ	AX, $7
	JA	_8_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX			// DX = haystack start + len(s) - len(sep) + 1
	MOVL	-4(BP)(AX*1), BX
	MOVL	(BP), BP
loop5to7:
	MOVL	(DI), SI
	CMPL	SI, BP
	JZ	partial_success5to7
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop5to7
	JMP	fail
partial_success5to7:
	MOVL	-4(AX)(DI*1), SI
	CMPL	SI, BX
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop5to7
	JMP	fail
_8_or_more:
	// --- needle length 8: one 64-bit compare ---
	CMPQ	AX, $8
	JA	_9_or_more
	MOVQ	(BP), BP
	LEAQ	-7(DI)(DX*1), DX
loop8:
	MOVQ	(DI), SI
	CMPQ	SI, BP
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop8
	JMP	fail
_9_or_more:
	// --- needle length 9..15: first 8 bytes in BP, last 8 in BX ---
	CMPQ	AX, $15
	JA	_16_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVQ	-8(BP)(AX*1), BX
	MOVQ	(BP), BP
loop9to15:
	MOVQ	(DI), SI
	CMPQ	SI, BP
	JZ	partial_success9to15
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop9to15
	JMP	fail
partial_success9to15:
	MOVQ	-8(AX)(DI*1), SI
	CMPQ	SI, BX
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop9to15
	JMP	fail
_16_or_more:
	// --- needle length 16: one 128-bit compare ---
	CMPQ	AX, $16
	JA	_17_or_more
	MOVOU	(BP), X1
	LEAQ	-15(DI)(DX*1), DX
loop16:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB	X2, SI
	CMPQ	SI, $0xffff		// all 16 bytes equal?
	JE	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop16
	JMP	fail
_17_or_more:
	// --- needle length 17..31: first 16 bytes in X1, last 16 in X0 ---
	CMPQ	AX, $31
	JA	_32_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVOU	-16(BP)(AX*1), X0
	MOVOU	(BP), X1
loop17to31:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB	X2, SI
	CMPQ	SI, $0xffff
	JE	partial_success17to31
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop17to31
	JMP	fail
partial_success17to31:
	MOVOU	-16(AX)(DI*1), X3
	PCMPEQB	X0, X3
	PMOVMSKB	X3, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop17to31
	JMP	fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	// --- needle length 32: one 256-bit compare ---
	CMPQ	AX, $32
	JA	_33_to_63
	VMOVDQU	(BP), Y1
	LEAQ	-31(DI)(DX*1), DX
loop32:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, SI
	CMPL	SI, $0xffffffff		// all 32 bytes equal?
	JE	success_avx2
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop32
	JMP	fail_avx2
_33_to_63:
	// --- needle length 33..63: first 32 bytes in Y1, last 32 in Y0 ---
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	VMOVDQU	-32(BP)(AX*1), Y0
	VMOVDQU	(BP), Y1
loop33to63:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, SI
	CMPL	SI, $0xffffffff
	JE	partial_success33to63
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop33to63
	JMP	fail_avx2
partial_success33to63:
	VMOVDQU	-32(AX)(DI*1), Y3
	VPCMPEQB	Y0, Y3, Y4
	VPMOVMSKB	Y4, SI
	CMPL	SI, $0xffffffff
	JE	success_avx2
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop33to63
	// Scan exhausted: fall through to the AVX2 failure exit.
fail_avx2:
	VZEROUPPER
fail:
	MOVQ	$-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP	success
sse42:
	CMPB	runtime·support_sse42(SB), $1
	JNE	no_sse42
	CMPQ	AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE	_9_or_more
	// The needle is loaded with a full 16-byte read below. Take the
	// scalar paths instead when BP lies in the last 16 bytes of a
	// 4K page, so that read can never run onto the next page.
	LEAQ	16(BP), SI
	TESTW	$0xff0, SI
	JEQ	no_sse42
	MOVOU	(BP), X1
	LEAQ	-15(DI)(DX*1), SI	// SI = 1 + address of the last 16-byte window
	MOVQ	$16, R9
	SUBQ	AX, R9			// We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI	$0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ	CX, R9
	JBE	sse42_success
	ADDQ	R9, DI
	CMPQ	DI, SI
	JB	loop_sse42
	// Final probe over the last 16 bytes of the haystack.
	PCMPESTRI	$0x0c, -1(SI), X1
	CMPQ	CX, R9
	JA	fail
	LEAQ	-1(SI), DI
sse42_success:
	ADDQ	CX, DI			// DI = address of the match
success:
	SUBQ	R10, DI			// address -> index from haystack start
	MOVQ	DI, (R11)
	RET
1948
1949
// bytes·IndexByte is the stack-argument entry point for searching a
// byte slice for a single byte. It loads the registers that
// runtime·indexbytebody expects and tail-jumps to it; the index (or -1)
// is written through R8 into the ret slot at 32(FP).
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVB	c+24(FP), AL		// byte sought
	MOVQ	s+0(FP), SI		// data
	MOVQ	s_len+8(FP), BX		// data len
	LEAQ	ret+32(FP), R8		// result address
	JMP	runtime·indexbytebody(SB)
Brad Fitzpatrick598c7892013-08-05 15:04:05 -07001956
// strings·IndexByte is the stack-argument entry point for searching a
// string for a single byte. Same register contract as bytes·IndexByte;
// the byte argument is at 16(FP) and the result slot at 24(FP).
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVB	c+16(FP), AL		// byte sought
	MOVQ	s+0(FP), SI		// data
	MOVQ	s_len+8(FP), BX		// data len
	LEAQ	ret+24(FP), R8		// result address
	JMP	runtime·indexbytebody(SB)
Brad Fitzpatrick598c7892013-08-05 15:04:05 -07001963
// runtime·indexbytebody stores, through R8, the index of the first
// occurrence of a byte in a buffer, or -1 if the byte is absent.
// It is entered by JMP from the bytes/strings IndexByte wrappers.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Registers written: AX, CX, DX, DI, R11, X0, X1 and, on the AVX2
// path, Y1-Y3.
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI			// DI = scan cursor; SI keeps the start

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI			// Compute offset of chunk within data.
	ADDQ	DX, DI			// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure			// empty input never matches

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1		// Load data
	PCMPEQB	X0, X1			// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX		// Move result bits to integer register.
	BSFL	DX, DX			// Find first set bit.
	JZ	failure			// No set bit, failure.
	CMPL	DX, BX
	JAE	failure			// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1			// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX		// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX			// Shift desired bits down to bottom of register.
	BSFL	DX, DX			// Find first set bit.
	JZ	failure			// No set bit, failure.
	MOVQ	DX, (R8)
	RET

// Lengths > 32: use 32-byte AVX2 compares when the CPU supports them,
// otherwise go back to the SSE loop.
avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1		// every byte of Y1 = byte sought
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLT	avx2_loop
	// Last 32-byte chunk; may overlap the chunks already searched.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER
	MOVQ	$-1, (R8)
	RET

// Y3 holds the compare result for the chunk loaded from DI.
avx2success:
	VPMOVMSKB	Y3, DX
	BSFL	DX, DX			// index of first match within the chunk
	SUBQ	SI, DI			// offset of the chunk within the data
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	VZEROUPPER
	RET
2090
// bytes·Equal reports whether two byte slices have identical contents.
// Slices of different length are unequal without looking at the data;
// otherwise the comparison is delegated to runtime·memeqbody with
// SI/DI = data pointers, BX = length and AX = address of the one-byte
// result slot at 48(FP).
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JEQ	samelen
	// Lengths differ: result is false.
	MOVB	$0, ret+48(FP)
	RET
samelen:
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
Keith Randall6c7cbf02014-04-01 12:51:02 -07002103
Josselin Costanzid206af12017-03-27 13:22:59 +02002104
// bytes·countByte is the stack-argument entry point for counting the
// occurrences of a byte in a byte slice. It loads the registers that
// runtime·countByte expects and tail-jumps to it; the count is written
// through R8 into the ret slot at 32(FP).
TEXT bytes·countByte(SB),NOSPLIT,$0-40
	MOVB	c+24(FP), AL		// byte sought
	MOVQ	s+0(FP), SI		// data
	MOVQ	s_len+8(FP), BX		// data len
	LEAQ	ret+32(FP), R8		// result address
	JMP	runtime·countByte(SB)
2111
// strings·countByte is the stack-argument entry point for counting the
// occurrences of a byte in a string. Same register contract as
// bytes·countByte; the byte argument is at 16(FP) and the result slot
// at 24(FP).
TEXT strings·countByte(SB),NOSPLIT,$0-32
	MOVB	c+16(FP), AL		// byte sought
	MOVQ	s+0(FP), SI		// data
	MOVQ	s_len+8(FP), BX		// data len
	LEAQ	ret+24(FP), R8		// result address
	JMP	runtime·countByte(SB)
2118
// runtime·countByte stores, through R8, the number of bytes in a buffer
// that equal a given byte. It is entered by JMP from the bytes/strings
// countByte wrappers.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This requires the POPCNT instruction
//
// Registers written: AX, BX, CX, DX, DI, R10, R11, R12, X0, X1 and, on
// the AVX2 path, Y1-Y3.
//
// Unlike an index search, counting must not look at any byte twice, so
// wherever the final chunk overlaps bytes already counted (or bytes
// outside the buffer) the compare mask is trimmed before POPCNT.
TEXT runtime·countByte(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12			// Accumulator

	MOVQ	SI, DI			// DI = scan cursor

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Count number of matching bytes
	POPCNTL	DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10			// low (16 - BX) bits cleared

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10			// R10 = (1 << len) - 1

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10			// low (16 - len) bits cleared

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

// Lengths > 32: use 32-byte AVX2 compares when the CPU supports them,
// otherwise go back to the SSE loop.
avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1		// every byte of Y1 = byte sought
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLE	avx2_loop

	// Number of bytes not yet counted. It is zero exactly when the
	// length is a multiple of 32, in which case the loop above has
	// already covered everything and the tail can be skipped.
	// (The loop only exits with DI > R11, so a DI == R11 test here
	// could never succeed.)
	ANDQ	$31, BX
	JZ	endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next.
	MOVQ	$32, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10			// low (32 - BX) bits cleared
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET
2293
// runtime·return0 sets AX to 0 and returns. No stack frame is used and
// no other register is written. MOVL is used rather than an XOR, so the
// flags are left as they were.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET
Keith Randall1b6807b2014-09-25 07:59:01 -07002297
2298
// _cgo_topofstack returns g->m->curg.stack.hi in AX.
// It is called from cgo wrappers and must obey the gcc calling
// convention. Only AX and CX are written.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX			// AX = g
	MOVQ	g_m(AX), AX			// AX = g->m
	MOVQ	m_curg(AX), AX			// AX = g->m->curg
	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
	RET
Russ Coxa5a07332014-10-29 20:37:44 -04002308
// runtime·goexit: the top-most function running on a goroutine
// returns to goexit+PCQuantum, i.e. just past the leading NOP below.
// The body only calls runtime·goexit1, which does not return.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	// (this trailing NOP keeps the CALL's return address inside it)
	BYTE	$0x90	// NOP
Russ Cox15ced2d2014-11-11 17:06:22 -05002316
// runtime·addmoduledata appends the moduledata whose address is in DI
// to the list that ends at runtime·lastmoduledatap.
// This is called from .init_array and follows the platform, not Go, ABI.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	// The access to global variables below implicitly uses R15, which
	// is callee-save, so preserve it around the body.
	PUSHQ	R15
	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current tail
	MOVQ	DI, moduledata_next(AX)		// tail.next = new module
	MOVQ	DI, runtime·lastmoduledatap(SB)	// new module becomes the tail
	POPQ	R15
	RET