// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// _rt0_amd64 is common startup code for most amd64 systems when using
// internal linking. This is the entry point for the program from the
// kernel for an ordinary -buildmode=exe program. The stack holds the
// number of arguments and the C-style argv.
TEXT _rt0_amd64(SB),NOSPLIT,$-8
	MOVQ	0(SP), DI	// argc
	LEAQ	8(SP), SI	// argv
	JMP	runtime·rt0_go(SB)

// main is common startup code for most amd64 systems when using
// external linking. The C startup code will call the symbol "main"
// passing argc and argv in the usual C ABI registers DI and SI.
TEXT main(SB),NOSPLIT,$-8
	JMP	runtime·rt0_go(SB)

// _rt0_amd64_lib is common startup code for most amd64 systems when
// using -buildmode=c-archive or -buildmode=c-shared. The linker will
// arrange to invoke this function as a global constructor (for
// c-archive) or when the shared library is loaded (for c-shared).
// We expect argc and argv to be passed in the usual C ABI registers
// DI and SI.
TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50
	// Align stack per ELF ABI requirements.
	MOVQ	SP, AX
	ANDQ	$~15, SP
	// Save C ABI callee-saved registers, as caller may need them.
	MOVQ	BX, 0x10(SP)
	MOVQ	BP, 0x18(SP)
	MOVQ	R12, 0x20(SP)
	MOVQ	R13, 0x28(SP)
	MOVQ	R14, 0x30(SP)
	MOVQ	R15, 0x38(SP)
	MOVQ	AX, 0x40(SP)

	MOVQ	DI, _rt0_amd64_lib_argc<>(SB)
	MOVQ	SI, _rt0_amd64_lib_argv<>(SB)

	// Synchronous initialization.
	CALL	runtime·libpreinit(SB)

	// Create a new thread to finish Go runtime initialization.
	MOVQ	_cgo_sys_thread_create(SB), AX
	TESTQ	AX, AX
	JZ	nocgo
	MOVQ	$_rt0_amd64_lib_go(SB), DI
	MOVQ	$0, SI
	CALL	AX
	JMP	restore

nocgo:
	MOVQ	$0x800000, 0(SP)	// stacksize
	MOVQ	$_rt0_amd64_lib_go(SB), AX
	MOVQ	AX, 8(SP)		// fn
	CALL	runtime·newosproc0(SB)

restore:
	MOVQ	0x10(SP), BX
	MOVQ	0x18(SP), BP
	MOVQ	0x20(SP), R12
	MOVQ	0x28(SP), R13
	MOVQ	0x30(SP), R14
	MOVQ	0x38(SP), R15
	MOVQ	0x40(SP), SP
	RET

// _rt0_amd64_lib_go initializes the Go runtime.
// This is started in a separate thread by _rt0_amd64_lib.
TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0
	MOVQ	_rt0_amd64_lib_argc<>(SB), DI
	MOVQ	_rt0_amd64_lib_argv<>(SB), SI
	JMP	runtime·rt0_go(SB)

DATA _rt0_amd64_lib_argc<>(SB)/8, $0
GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8
DATA _rt0_amd64_lib_argv<>(SB)/8, $0
GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8

TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP	// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVL	$0, AX
	CPUID
	MOVL	AX, SI
	CMPL	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·isIntel(SB)
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVL	$1, AX
	CPUID
	MOVL	AX, runtime·processorVersionInfo(SB)

	TESTL	$(1<<26), DX // SSE2
	SETNE	runtime·support_sse2(SB)

	TESTL	$(1<<9), CX // SSSE3
	SETNE	runtime·support_ssse3(SB)

	TESTL	$(1<<19), CX // SSE4.1
	SETNE	runtime·support_sse41(SB)

	TESTL	$(1<<20), CX // SSE4.2
	SETNE	runtime·support_sse42(SB)

	TESTL	$(1<<23), CX // POPCNT
	SETNE	runtime·support_popcnt(SB)

	TESTL	$(1<<25), CX // AES
	SETNE	runtime·support_aes(SB)

	TESTL	$(1<<27), CX // OSXSAVE
	SETNE	runtime·support_osxsave(SB)

	// If OS support for XMM and YMM is not present
	// support_avx will be set back to false later.
	TESTL	$(1<<28), CX // AVX
	SETNE	runtime·support_avx(SB)

eax7:
	// Load EAX=7/ECX=0 cpuid flags
	CMPL	SI, $7
	JLT	osavx
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID

	TESTL	$(1<<3), BX // BMI1
	SETNE	runtime·support_bmi1(SB)

	// If OS support for XMM and YMM is not present
	// support_avx2 will be set back to false later.
	TESTL	$(1<<5), BX
	SETNE	runtime·support_avx2(SB)

	TESTL	$(1<<8), BX // BMI2
	SETNE	runtime·support_bmi2(SB)

	TESTL	$(1<<9), BX // ERMS
	SETNE	runtime·support_erms(SB)

osavx:
	CMPB	runtime·support_osxsave(SB), $1
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
	JE	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
	MOVB	$0, runtime·support_avx2(SB)

nocpuinfo:
	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash
	RET

DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 * go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	BP, gobuf_bp(AX)
	// Assert ctxt is zero. See func save.
	MOVQ	gobuf_ctxt(AX), BX
	TESTQ	BX, BX
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $16-8
	MOVQ	buf+0(FP), BX		// gobuf
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
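//
// For example (illustrative only, not part of this file's interface): the
// scheduler parks the running goroutine roughly as
//
//	mcall(park_m)	// switch to g0 and run park_m(gp); control does not return here
//
// where park_m is a Go func(*g) defined elsewhere in the runtime.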
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	POPQ	AX
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
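//
// A typical call from Go code in the runtime looks roughly like (illustrative):
//
//	systemstack(func() {
//		// work that must run on the per-M g0 stack
//	})
//
// after which execution resumes on the calling goroutine's stack.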
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack; tail call the function
	// Using a tail call here cleans up tracebacks since we won't stop
	// at an intermediate systemstack.
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	JMP	DI

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
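//
// (Illustrative sketch, not part of this file: the compiler-generated
// prologue that reaches here looks roughly like
//
//	MOVQ	(TLS), CX
//	CMPQ	SP, 16(CX)	// SP vs g->stackguard0
//	JLS	morestack_and_retry
//
// with the closure context, if any, in DX, which is why the ctxt is
// taken from DX below.)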
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackg0(SB)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackgsignal(SB)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)
	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
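// For example (informal): with an argument frame of 40 bytes, the CMPQ against
// $32 takes the JA and skips runtime·call32, the CMPQ against $64 does not,
// so control jumps to runtime·call64; each size class handles frames up to
// its own size.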

TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA	$PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX	retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET

// callRet copies return values back at the end of call*. This is a
// separate function so it can allocate stack space for the arguments
// to reflectcallmove. It does not follow the Go ABI; it expects its
// arguments in registers.
TEXT callRet<>(SB), NOSPLIT, $32-0
	NO_LOCAL_POINTERS
	MOVQ	DX, 0(SP)
	MOVQ	DI, 8(SP)
	MOVQ	SI, 16(SP)
	MOVQ	CX, 24(SP)
	CALL	runtime·reflectcallmove(SB)
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET


TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the caller's return
// 3. jmp to the argument
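// (The constant 5 is the size in bytes of the direct CALL instruction that
// invoked deferreturn: one opcode byte plus a 4-byte displacement. Backing the
// saved return address up by 5 therefore makes the caller re-execute that
// CALL, so deferreturn runs again for the next deferred function.)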
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
	SUBQ	$5, (SP)	// return to CALL again
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	// Assert ctxt is zero. See func save.
	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
	TESTQ	R9, R9
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	CMPQ	R8, $0
	JEQ	nosave
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	// Switch to system stack.
	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

nosave:
	// Running on a system stack, perhaps even without a g.
	// Having no g can happen during thread creation or thread teardown
	// (see needm/dropm on Solaris, for example).
	// This code is like the above sequence but without saving/restoring g
	// and without worrying about the stack moving out from under us
	// (because we're on a system stack, not a goroutine stack).
	// The above code could be used directly if already on a system stack,
	// but then the only path through this code would be a rare case on Solaris.
	// Using this code for all "already on system stack" calls exercises it more,
	// which should help keep it correct.
	SUBQ	$64, SP
	ANDQ	$~15, SP
	MOVQ	$0, 48(SP)	// where above code stores g, in case someone looks during debugging
	MOVQ	DX, 40(SP)	// save original stack pointer
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX
	MOVQ	40(SP), SI	// restore original stack pointer
	MOVQ	SI, SP
	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	ctxt+24(FP), AX
	MOVQ	AX, 24(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8 // holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 8(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved base pointer
	MOVQ	ctxt+24(FP), BX
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 8(SP)
	MOVQ	BX, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	8(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE	3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

// func cputicks() int64
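//
// Informally (sketch only), the body below computes
//
//	LFENCE (Intel) or MFENCE (others) to serialize RDTSC
//	lo, hi := rdtsc()
//	return int64(hi)<<32 + int64(lo)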
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	LFENCE
	JMP	done
mfence:
	MFENCE
done:
	RDTSC
	SHLQ	$32, DX
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
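//
// The body dispatches on the length in CX (informal summary):
//	0..15 bytes   -> aes0to15 (aes0 for an empty input)
//	16 bytes      -> aes16
//	17..32 bytes  -> aes17to32
//	33..64 bytes  -> aes33to64
//	65..128 bytes -> aes65to128
//	129+ bytes    -> aes129plus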
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW	$0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
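// For example (reading the table below): entry n starts at offset n*16 and is
// indexed by the PAND in aes0to15 above as (AX)(CX*8) with CX already doubled,
// so a 3-byte input selects masks<>+0x30, $0x0000000000ffffff / $0, which
// keeps only the low 3 data bytes.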
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

Shenghou Ma3583a442015-09-03 02:44:26 -04001315TEXT ·checkASM(SB),NOSPLIT,$0-1
1316	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
1317 MOVQ $masks<>(SB), AX
1318 MOVQ $shifts<>(SB), BX
1319 ORQ BX, AX
1320 TESTQ $15, AX
1321 SETEQ ret+0(FP)
1322 RET
1323
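// In Go terms the check above amounts to this hedged sketch (masksAddr and
// shiftsAddr stand for the addresses of masks<>(SB) and shifts<>(SB)):
//
//	func checkASM() bool {
//		return (masksAddr|shiftsAddr)&15 == 0
//	}
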
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +00001324// these are arguments to pshufb. They move data down from
Russ Cox9ddfb642013-07-16 16:24:09 -04001325// the high bytes of the register to the low bytes of the register.
1326// index is how many bytes to move.
1327DATA shifts<>+0x00(SB)/8, $0x0000000000000000
1328DATA shifts<>+0x08(SB)/8, $0x0000000000000000
1329DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
1330DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
1331DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
1332DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
1333DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
1334DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
1335DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
1336DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
1337DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
1338DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
1339DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
1340DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
1341DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
1342DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
1343DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
1344DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
1345DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
1346DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
1347DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
1348DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
1349DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
1350DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
1351DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
1352DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
1353DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
1354DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
1355DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
1356DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
1357DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
1358DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
Keith Randall5a546962013-08-07 10:23:24 -07001359GLOBL shifts<>(SB),RODATA,$256
Keith Randall3d5daa22013-04-02 16:26:15 -07001360
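// Using the n'th 16-byte entry of shifts<> as a PSHUFB mask behaves like
// this Go sketch (for reference only; 0xff mask bytes produce zeros):
//
//	func shiftDown(x [16]byte, n int) (r [16]byte) {
//		copy(r[:n], x[16-n:]) // top n bytes move to the bottom
//		return                // the remaining bytes stay zero
//	}
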
Keith Randallbd70bd92016-02-22 13:20:38 -08001361// memequal(p, q unsafe.Pointer, size uintptr) bool
1362TEXT runtime·memequal(SB),NOSPLIT,$0-25
Keith Randall0c6b55e2014-07-16 14:16:19 -07001363 MOVQ a+0(FP), SI
1364 MOVQ b+8(FP), DI
Keith Randallbd70bd92016-02-22 13:20:38 -08001365 CMPQ SI, DI
1366 JEQ eq
Keith Randall0c6b55e2014-07-16 14:16:19 -07001367 MOVQ size+16(FP), BX
Keith Randallc526f3a2015-04-21 14:22:41 -07001368 LEAQ ret+24(FP), AX
1369 JMP runtime·memeqbody(SB)
Keith Randallbd70bd92016-02-22 13:20:38 -08001370eq:
1371 MOVB $1, ret+24(FP)
1372 RET
Keith Randall0c6b55e2014-07-16 14:16:19 -07001373
Keith Randalld5e4c402015-01-06 16:42:48 -08001374// memequal_varlen(a, b unsafe.Pointer) bool
1375TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
1376 MOVQ a+0(FP), SI
1377 MOVQ b+8(FP), DI
1378 CMPQ SI, DI
1379 JEQ eq
1380 MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure
Keith Randallc526f3a2015-04-21 14:22:41 -07001381 LEAQ ret+16(FP), AX
1382 JMP runtime·memeqbody(SB)
Keith Randalld5e4c402015-01-06 16:42:48 -08001383eq:
1384 MOVB $1, ret+16(FP)
1385 RET
1386
Keith Randall3d5daa22013-04-02 16:26:15 -07001387// a in SI
1388// b in DI
1389// count in BX
Keith Randallc526f3a2015-04-21 14:22:41 -07001390// address of result byte in AX
Keith Randall5a546962013-08-07 10:23:24 -07001391TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
Keith Randall3d5daa22013-04-02 16:26:15 -07001392 CMPQ BX, $8
1393 JB small
Ilya Tocar967564b2015-10-29 17:17:05 +03001394 CMPQ BX, $64
1395 JB bigloop
1396 CMPB runtime·support_avx2(SB), $1
1397 JE hugeloop_avx2
Keith Randall3d5daa22013-04-02 16:26:15 -07001398
1399 // 64 bytes at a time using xmm registers
1400hugeloop:
1401 CMPQ BX, $64
1402 JB bigloop
1403 MOVOU (SI), X0
1404 MOVOU (DI), X1
1405 MOVOU 16(SI), X2
1406 MOVOU 16(DI), X3
1407 MOVOU 32(SI), X4
1408 MOVOU 32(DI), X5
1409 MOVOU 48(SI), X6
1410 MOVOU 48(DI), X7
1411 PCMPEQB X1, X0
1412 PCMPEQB X3, X2
1413 PCMPEQB X5, X4
1414 PCMPEQB X7, X6
1415 PAND X2, X0
1416 PAND X6, X4
1417 PAND X4, X0
1418 PMOVMSKB X0, DX
1419 ADDQ $64, SI
1420 ADDQ $64, DI
1421 SUBQ $64, BX
1422 CMPL DX, $0xffff
1423 JEQ hugeloop
Keith Randallc526f3a2015-04-21 14:22:41 -07001424 MOVB $0, (AX)
Keith Randall3d5daa22013-04-02 16:26:15 -07001425 RET
1426
Ilya Tocar967564b2015-10-29 17:17:05 +03001427 // 64 bytes at a time using ymm registers
1428hugeloop_avx2:
1429 CMPQ BX, $64
1430 JB bigloop_avx2
Russ Cox8d881b82016-01-22 22:25:15 -05001431 VMOVDQU (SI), Y0
1432 VMOVDQU (DI), Y1
1433 VMOVDQU 32(SI), Y2
1434 VMOVDQU 32(DI), Y3
1435 VPCMPEQB Y1, Y0, Y4
1436 VPCMPEQB Y2, Y3, Y5
1437 VPAND Y4, Y5, Y6
1438 VPMOVMSKB Y6, DX
Ilya Tocar967564b2015-10-29 17:17:05 +03001439 ADDQ $64, SI
1440 ADDQ $64, DI
1441 SUBQ $64, BX
1442 CMPL DX, $0xffffffff
1443 JEQ hugeloop_avx2
1444 VZEROUPPER
1445 MOVB $0, (AX)
1446 RET
1447
1448bigloop_avx2:
1449 VZEROUPPER
1450
Keith Randall3d5daa22013-04-02 16:26:15 -07001451 // 8 bytes at a time using 64-bit register
1452bigloop:
1453 CMPQ BX, $8
1454 JBE leftover
1455 MOVQ (SI), CX
1456 MOVQ (DI), DX
1457 ADDQ $8, SI
1458 ADDQ $8, DI
1459 SUBQ $8, BX
1460 CMPQ CX, DX
1461 JEQ bigloop
Keith Randallc526f3a2015-04-21 14:22:41 -07001462 MOVB $0, (AX)
Keith Randall3d5daa22013-04-02 16:26:15 -07001463 RET
1464
1465 // remaining 0-8 bytes
1466leftover:
1467 MOVQ -8(SI)(BX*1), CX
1468 MOVQ -8(DI)(BX*1), DX
1469 CMPQ CX, DX
Keith Randallc526f3a2015-04-21 14:22:41 -07001470 SETEQ (AX)
Keith Randall3d5daa22013-04-02 16:26:15 -07001471 RET
1472
1473small:
1474 CMPQ BX, $0
1475 JEQ equal
1476
1477 LEAQ 0(BX*8), CX
1478 NEGQ CX
1479
1480 CMPB SI, $0xf8
1481 JA si_high
1482
1483 // load at SI won't cross a page boundary.
1484 MOVQ (SI), SI
1485 JMP si_finish
1486si_high:
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +00001487	// address ends in 11111xxx. Load the 8 bytes ending at the last byte we want, then shift them into position.
Keith Randall3d5daa22013-04-02 16:26:15 -07001488 MOVQ -8(SI)(BX*1), SI
1489 SHRQ CX, SI
1490si_finish:
1491
1492 // same for DI.
1493 CMPB DI, $0xf8
1494 JA di_high
1495 MOVQ (DI), DI
1496 JMP di_finish
1497di_high:
1498 MOVQ -8(DI)(BX*1), DI
1499 SHRQ CX, DI
1500di_finish:
1501
1502 SUBQ SI, DI
1503 SHLQ CX, DI
1504equal:
Keith Randallc526f3a2015-04-21 14:22:41 -07001505 SETEQ (AX)
Keith Randall3d5daa22013-04-02 16:26:15 -07001506 RET
Keith Randallb3946dc2013-05-14 16:05:51 -07001507
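// memeqbody answers "are these two byte ranges identical?" using 64-byte
// SSE/AVX2 chunks, then 8-byte words, then a masked tail. A portable Go
// sketch of the same contract (for reference only; callers reach here via
// the assembly wrappers above):
//
//	func memeq(a, b []byte) bool {
//		if len(a) != len(b) {
//			return false
//		}
//		for i := range a {
//			if a[i] != b[i] {
//				return false
//			}
//		}
//		return true
//	}
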
Keith Randall5a546962013-08-07 10:23:24 -07001508TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
Russ Cox25f6b022014-08-27 11:32:17 -04001509 MOVQ s1_base+0(FP), SI
1510 MOVQ s1_len+8(FP), BX
1511 MOVQ s2_base+16(FP), DI
1512 MOVQ s2_len+24(FP), DX
Keith Randallc526f3a2015-04-21 14:22:41 -07001513 LEAQ ret+32(FP), R9
1514 JMP runtime·cmpbody(SB)
Keith Randallb3946dc2013-05-14 16:05:51 -07001515
Russ Cox7a524a12014-12-22 13:27:53 -05001516TEXT bytes·Compare(SB),NOSPLIT,$0-56
Keith Randallb3946dc2013-05-14 16:05:51 -07001517 MOVQ s1+0(FP), SI
1518 MOVQ s1+8(FP), BX
1519 MOVQ s2+24(FP), DI
1520 MOVQ s2+32(FP), DX
Keith Randallc526f3a2015-04-21 14:22:41 -07001521 LEAQ res+48(FP), R9
1522 JMP runtime·cmpbody(SB)
Keith Randallb3946dc2013-05-14 16:05:51 -07001523
1524// input:
1525// SI = a
1526// DI = b
1527// BX = alen
1528// DX = blen
Keith Randallc526f3a2015-04-21 14:22:41 -07001529// R9 = address of output word (stores -1/0/1 here)
Keith Randall5a546962013-08-07 10:23:24 -07001530TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
Keith Randallb3946dc2013-05-14 16:05:51 -07001531 CMPQ SI, DI
Russ Coxb55791e2014-10-28 21:50:16 -04001532 JEQ allsame
Keith Randallb3946dc2013-05-14 16:05:51 -07001533 CMPQ BX, DX
Austin Clements20a6ff72015-01-27 18:29:02 -05001534 MOVQ DX, R8
1535 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
1536 CMPQ R8, $8
Russ Coxb55791e2014-10-28 21:50:16 -04001537 JB small
Keith Randallb3946dc2013-05-14 16:05:51 -07001538
Uttam C Pawar32add8d2015-07-02 11:43:46 -07001539 CMPQ R8, $63
Ilya Tocar0e23ca42015-10-28 23:20:26 +03001540 JBE loop
1541 CMPB runtime·support_avx2(SB), $1
1542 JEQ big_loop_avx2
1543 JMP big_loop
Russ Coxb55791e2014-10-28 21:50:16 -04001544loop:
Austin Clements20a6ff72015-01-27 18:29:02 -05001545 CMPQ R8, $16
Russ Coxb55791e2014-10-28 21:50:16 -04001546 JBE _0through16
Keith Randallb3946dc2013-05-14 16:05:51 -07001547 MOVOU (SI), X0
1548 MOVOU (DI), X1
1549 PCMPEQB X0, X1
1550 PMOVMSKB X1, AX
1551 XORQ $0xffff, AX // convert EQ to NE
Russ Coxb55791e2014-10-28 21:50:16 -04001552 JNE diff16 // branch if at least one byte is not equal
Keith Randallb3946dc2013-05-14 16:05:51 -07001553 ADDQ $16, SI
1554 ADDQ $16, DI
Austin Clements20a6ff72015-01-27 18:29:02 -05001555 SUBQ $16, R8
Russ Coxb55791e2014-10-28 21:50:16 -04001556 JMP loop
Keith Randallb3946dc2013-05-14 16:05:51 -07001557
Uttam C Pawar32add8d2015-07-02 11:43:46 -07001558diff64:
1559 ADDQ $48, SI
1560 ADDQ $48, DI
1561 JMP diff16
1562diff48:
1563 ADDQ $32, SI
1564 ADDQ $32, DI
1565 JMP diff16
1566diff32:
1567 ADDQ $16, SI
1568 ADDQ $16, DI
Keith Randallb3946dc2013-05-14 16:05:51 -07001569 // AX = bit mask of differences
Russ Coxb55791e2014-10-28 21:50:16 -04001570diff16:
Keith Randallb3946dc2013-05-14 16:05:51 -07001571 BSFQ AX, BX // index of first byte that differs
1572 XORQ AX, AX
1573 MOVB (SI)(BX*1), CX
1574 CMPB CX, (DI)(BX*1)
1575 SETHI AX
1576 LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
Keith Randallc526f3a2015-04-21 14:22:41 -07001577 MOVQ AX, (R9)
Keith Randallb3946dc2013-05-14 16:05:51 -07001578 RET
1579
1580 // 0 through 16 bytes left, alen>=8, blen>=8
Russ Coxb55791e2014-10-28 21:50:16 -04001581_0through16:
Austin Clements20a6ff72015-01-27 18:29:02 -05001582 CMPQ R8, $8
Russ Coxb55791e2014-10-28 21:50:16 -04001583 JBE _0through8
Keith Randallb3946dc2013-05-14 16:05:51 -07001584 MOVQ (SI), AX
1585 MOVQ (DI), CX
1586 CMPQ AX, CX
Russ Coxb55791e2014-10-28 21:50:16 -04001587 JNE diff8
1588_0through8:
Austin Clements20a6ff72015-01-27 18:29:02 -05001589 MOVQ -8(SI)(R8*1), AX
1590 MOVQ -8(DI)(R8*1), CX
Keith Randallb3946dc2013-05-14 16:05:51 -07001591 CMPQ AX, CX
Russ Coxb55791e2014-10-28 21:50:16 -04001592 JEQ allsame
Keith Randallb3946dc2013-05-14 16:05:51 -07001593
1594 // AX and CX contain parts of a and b that differ.
Russ Coxb55791e2014-10-28 21:50:16 -04001595diff8:
Keith Randallb3946dc2013-05-14 16:05:51 -07001596 BSWAPQ AX // reverse order of bytes
1597 BSWAPQ CX
1598 XORQ AX, CX
1599 BSRQ CX, CX // index of highest bit difference
1600 SHRQ CX, AX // move a's bit to bottom
1601 ANDQ $1, AX // mask bit
1602 LEAQ -1(AX*2), AX // 1/0 => +1/-1
Keith Randallc526f3a2015-04-21 14:22:41 -07001603 MOVQ AX, (R9)
Keith Randallb3946dc2013-05-14 16:05:51 -07001604 RET
1605
1606 // 0-7 bytes in common
Russ Coxb55791e2014-10-28 21:50:16 -04001607small:
Austin Clements20a6ff72015-01-27 18:29:02 -05001608 LEAQ (R8*8), CX // bytes left -> bits left
Keith Randallb3946dc2013-05-14 16:05:51 -07001609	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
Russ Coxb55791e2014-10-28 21:50:16 -04001610 JEQ allsame
Keith Randallb3946dc2013-05-14 16:05:51 -07001611
1612	// load bytes of a into high bytes of SI
1613 CMPB SI, $0xf8
Russ Coxb55791e2014-10-28 21:50:16 -04001614 JA si_high
Keith Randallb3946dc2013-05-14 16:05:51 -07001615 MOVQ (SI), SI
Russ Coxb55791e2014-10-28 21:50:16 -04001616 JMP si_finish
1617si_high:
Austin Clements20a6ff72015-01-27 18:29:02 -05001618 MOVQ -8(SI)(R8*1), SI
Keith Randallb3946dc2013-05-14 16:05:51 -07001619 SHRQ CX, SI
Russ Coxb55791e2014-10-28 21:50:16 -04001620si_finish:
Keith Randallb3946dc2013-05-14 16:05:51 -07001621 SHLQ CX, SI
1622
1623	// load bytes of b into high bytes of DI
1624 CMPB DI, $0xf8
Russ Coxb55791e2014-10-28 21:50:16 -04001625 JA di_high
Keith Randallb3946dc2013-05-14 16:05:51 -07001626 MOVQ (DI), DI
Russ Coxb55791e2014-10-28 21:50:16 -04001627 JMP di_finish
1628di_high:
Austin Clements20a6ff72015-01-27 18:29:02 -05001629 MOVQ -8(DI)(R8*1), DI
Keith Randallb3946dc2013-05-14 16:05:51 -07001630 SHRQ CX, DI
Russ Coxb55791e2014-10-28 21:50:16 -04001631di_finish:
Keith Randallb3946dc2013-05-14 16:05:51 -07001632 SHLQ CX, DI
1633
1634 BSWAPQ SI // reverse order of bytes
1635 BSWAPQ DI
1636 XORQ SI, DI // find bit differences
Russ Coxb55791e2014-10-28 21:50:16 -04001637 JEQ allsame
Keith Randallb3946dc2013-05-14 16:05:51 -07001638 BSRQ DI, CX // index of highest bit difference
1639 SHRQ CX, SI // move a's bit to bottom
1640 ANDQ $1, SI // mask bit
1641 LEAQ -1(SI*2), AX // 1/0 => +1/-1
Keith Randallc526f3a2015-04-21 14:22:41 -07001642 MOVQ AX, (R9)
Keith Randallb3946dc2013-05-14 16:05:51 -07001643 RET
1644
Russ Coxb55791e2014-10-28 21:50:16 -04001645allsame:
Keith Randallb3946dc2013-05-14 16:05:51 -07001646 XORQ AX, AX
1647 XORQ CX, CX
1648 CMPQ BX, DX
1649 SETGT AX // 1 if alen > blen
1650 SETEQ CX // 1 if alen == blen
1651 LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
Keith Randallc526f3a2015-04-21 14:22:41 -07001652 MOVQ AX, (R9)
Keith Randallb3946dc2013-05-14 16:05:51 -07001653 RET
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07001654
Uttam C Pawar32add8d2015-07-02 11:43:46 -07001655 // this works for >= 64 bytes of data.
1656big_loop:
1657 MOVOU (SI), X0
1658 MOVOU (DI), X1
1659 PCMPEQB X0, X1
1660 PMOVMSKB X1, AX
1661 XORQ $0xffff, AX
1662 JNE diff16
1663
1664 MOVOU 16(SI), X0
1665 MOVOU 16(DI), X1
1666 PCMPEQB X0, X1
1667 PMOVMSKB X1, AX
1668 XORQ $0xffff, AX
1669 JNE diff32
1670
1671 MOVOU 32(SI), X0
1672 MOVOU 32(DI), X1
1673 PCMPEQB X0, X1
1674 PMOVMSKB X1, AX
1675 XORQ $0xffff, AX
1676 JNE diff48
1677
1678 MOVOU 48(SI), X0
1679 MOVOU 48(DI), X1
1680 PCMPEQB X0, X1
1681 PMOVMSKB X1, AX
1682 XORQ $0xffff, AX
1683 JNE diff64
1684
1685 ADDQ $64, SI
1686 ADDQ $64, DI
1687 SUBQ $64, R8
1688 CMPQ R8, $64
1689 JBE loop
1690 JMP big_loop
1691
Ilya Tocar0e23ca42015-10-28 23:20:26 +03001692 // Compare 64-bytes per loop iteration.
1693 // Loop is unrolled and uses AVX2.
1694big_loop_avx2:
Russ Cox8d881b82016-01-22 22:25:15 -05001695 VMOVDQU (SI), Y2
1696 VMOVDQU (DI), Y3
1697 VMOVDQU 32(SI), Y4
1698 VMOVDQU 32(DI), Y5
1699 VPCMPEQB Y2, Y3, Y0
1700 VPMOVMSKB Y0, AX
Ilya Tocar0e23ca42015-10-28 23:20:26 +03001701 XORL $0xffffffff, AX
1702 JNE diff32_avx2
Russ Cox8d881b82016-01-22 22:25:15 -05001703 VPCMPEQB Y4, Y5, Y6
1704 VPMOVMSKB Y6, AX
Ilya Tocar0e23ca42015-10-28 23:20:26 +03001705 XORL $0xffffffff, AX
1706 JNE diff64_avx2
1707
1708 ADDQ $64, SI
1709 ADDQ $64, DI
1710 SUBQ $64, R8
1711 CMPQ R8, $64
1712 JB big_loop_avx2_exit
1713 JMP big_loop_avx2
1714
1715	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64-byte chunk.
1716diff32_avx2:
1717 VZEROUPPER
1718 JMP diff16
1719
1720 // Same as diff32_avx2, but for last 32 bytes.
1721diff64_avx2:
1722 VZEROUPPER
1723 JMP diff48
1724
1725 // For <64 bytes remainder jump to normal loop.
1726big_loop_avx2_exit:
1727 VZEROUPPER
1728 JMP loop
1729
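// cmpbody implements the three-way comparison behind runtime·cmpstring and
// bytes·Compare above. A plain Go sketch of the contract (reference only):
//
//	func cmp(a, b []byte) int {
//		n := len(a)
//		if len(b) < n {
//			n = len(b)
//		}
//		for i := 0; i < n; i++ {
//			if a[i] != b[i] {
//				if a[i] < b[i] {
//					return -1
//				}
//				return +1
//			}
//		}
//		switch {
//		case len(a) < len(b):
//			return -1
//		case len(a) > len(b):
//			return +1
//		}
//		return 0
//	}
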
Ilya Tocar95333ae2015-10-28 18:05:05 +03001730TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
1731 MOVQ s+0(FP), DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001732 // We want len in DX and AX, because PCMPESTRI implicitly consumes them
1733 MOVQ s_len+8(FP), DX
1734 MOVQ c+16(FP), BP
1735 MOVQ c_len+24(FP), AX
Ilya Tocar44f18542016-04-28 17:34:24 +03001736 MOVQ DI, R10
1737 LEAQ ret+32(FP), R11
1738 JMP runtime·indexShortStr(SB)
1739
1740TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
1741 MOVQ s+0(FP), DI
1742 MOVQ s_len+8(FP), DX
1743 MOVQ c+24(FP), BP
1744 MOVQ c_len+32(FP), AX
1745 MOVQ DI, R10
1746 LEAQ ret+48(FP), R11
1747 JMP runtime·indexShortStr(SB)
1748
1749// AX: length of the string we are searching for (the separator)
1750// DX: length of the string we are searching in
1751// DI: pointer to the string we are searching in
1752// BP: pointer to the string we are searching for (the separator)
1753// R11: address where the result (index or -1) is stored
1754TEXT runtime·indexShortStr(SB),NOSPLIT,$0
Ilya Tocar6b02a192016-04-21 18:24:12 +03001755 CMPQ AX, DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001756 JA fail
Ilya Tocar6b02a192016-04-21 18:24:12 +03001757 CMPQ DX, $16
1758 JAE sse42
1759no_sse42:
1760 CMPQ AX, $2
Ilya Tocar95333ae2015-10-28 18:05:05 +03001761 JA _3_or_more
Ilya Tocar6b02a192016-04-21 18:24:12 +03001762 MOVW (BP), BP
1763 LEAQ -1(DI)(DX*1), DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001764loop2:
1765 MOVW (DI), SI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001766 CMPW SI,BP
Ilya Tocar95333ae2015-10-28 18:05:05 +03001767 JZ success
1768 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001769 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001770 JB loop2
1771 JMP fail
1772_3_or_more:
Ilya Tocar6b02a192016-04-21 18:24:12 +03001773 CMPQ AX, $3
Ilya Tocar95333ae2015-10-28 18:05:05 +03001774 JA _4_or_more
Ilya Tocar6b02a192016-04-21 18:24:12 +03001775 MOVW 1(BP), BX
1776 MOVW (BP), BP
1777 LEAQ -2(DI)(DX*1), DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001778loop3:
1779 MOVW (DI), SI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001780 CMPW SI,BP
Ilya Tocar95333ae2015-10-28 18:05:05 +03001781 JZ partial_success3
1782 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001783 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001784 JB loop3
1785 JMP fail
1786partial_success3:
1787 MOVW 1(DI), SI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001788 CMPW SI,BX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001789 JZ success
1790 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001791 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001792 JB loop3
1793 JMP fail
1794_4_or_more:
Ilya Tocar6b02a192016-04-21 18:24:12 +03001795 CMPQ AX, $4
Ilya Tocar95333ae2015-10-28 18:05:05 +03001796 JA _5_or_more
Ilya Tocar6b02a192016-04-21 18:24:12 +03001797 MOVL (BP), BP
1798 LEAQ -3(DI)(DX*1), DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001799loop4:
1800 MOVL (DI), SI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001801 CMPL SI,BP
Ilya Tocar95333ae2015-10-28 18:05:05 +03001802 JZ success
1803 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001804 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001805 JB loop4
1806 JMP fail
1807_5_or_more:
Ilya Tocar6b02a192016-04-21 18:24:12 +03001808 CMPQ AX, $7
Ilya Tocar95333ae2015-10-28 18:05:05 +03001809 JA _8_or_more
Ilya Tocar6b02a192016-04-21 18:24:12 +03001810 LEAQ 1(DI)(DX*1), DX
1811 SUBQ AX, DX
1812 MOVL -4(BP)(AX*1), BX
1813 MOVL (BP), BP
Ilya Tocar95333ae2015-10-28 18:05:05 +03001814loop5to7:
1815 MOVL (DI), SI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001816 CMPL SI,BP
Ilya Tocar95333ae2015-10-28 18:05:05 +03001817 JZ partial_success5to7
1818 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001819 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001820 JB loop5to7
1821 JMP fail
1822partial_success5to7:
Ilya Tocar6b02a192016-04-21 18:24:12 +03001823 MOVL -4(AX)(DI*1), SI
1824 CMPL SI,BX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001825 JZ success
1826 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001827 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001828 JB loop5to7
1829 JMP fail
1830_8_or_more:
Ilya Tocar6b02a192016-04-21 18:24:12 +03001831 CMPQ AX, $8
Ilya Tocar95333ae2015-10-28 18:05:05 +03001832 JA _9_or_more
Ilya Tocar6b02a192016-04-21 18:24:12 +03001833 MOVQ (BP), BP
1834 LEAQ -7(DI)(DX*1), DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001835loop8:
1836 MOVQ (DI), SI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001837 CMPQ SI,BP
Ilya Tocar95333ae2015-10-28 18:05:05 +03001838 JZ success
1839 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001840 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001841 JB loop8
1842 JMP fail
1843_9_or_more:
Ilya Tocar0cff2192016-04-28 17:39:55 +03001844 CMPQ AX, $15
Ilya Tocar95333ae2015-10-28 18:05:05 +03001845 JA _16_or_more
Ilya Tocar6b02a192016-04-21 18:24:12 +03001846 LEAQ 1(DI)(DX*1), DX
1847 SUBQ AX, DX
1848 MOVQ -8(BP)(AX*1), BX
1849 MOVQ (BP), BP
Ilya Tocar95333ae2015-10-28 18:05:05 +03001850loop9to15:
1851 MOVQ (DI), SI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001852 CMPQ SI,BP
Ilya Tocar95333ae2015-10-28 18:05:05 +03001853 JZ partial_success9to15
1854 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001855 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001856 JB loop9to15
1857 JMP fail
1858partial_success9to15:
Ilya Tocar6b02a192016-04-21 18:24:12 +03001859 MOVQ -8(AX)(DI*1), SI
1860 CMPQ SI,BX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001861 JZ success
1862 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001863 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001864 JB loop9to15
1865 JMP fail
1866_16_or_more:
Ilya Tocar429bbf32016-05-25 16:33:19 +03001867 CMPQ AX, $16
Ilya Tocar0cff2192016-04-28 17:39:55 +03001868 JA _17_or_more
Ilya Tocar6b02a192016-04-21 18:24:12 +03001869 MOVOU (BP), X1
1870 LEAQ -15(DI)(DX*1), DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001871loop16:
1872 MOVOU (DI), X2
1873 PCMPEQB X1, X2
1874 PMOVMSKB X2, SI
1875 CMPQ SI, $0xffff
1876 JE success
1877 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001878 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001879 JB loop16
1880 JMP fail
Ilya Tocar0cff2192016-04-28 17:39:55 +03001881_17_or_more:
1882 CMPQ AX, $31
1883 JA _32_or_more
Ilya Tocar6b02a192016-04-21 18:24:12 +03001884 LEAQ 1(DI)(DX*1), DX
1885 SUBQ AX, DX
1886 MOVOU -16(BP)(AX*1), X0
1887 MOVOU (BP), X1
Ilya Tocar95333ae2015-10-28 18:05:05 +03001888loop17to31:
1889 MOVOU (DI), X2
1890 PCMPEQB X1,X2
1891 PMOVMSKB X2, SI
1892 CMPQ SI, $0xffff
1893 JE partial_success17to31
1894 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001895 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001896 JB loop17to31
1897 JMP fail
1898partial_success17to31:
Ilya Tocar6b02a192016-04-21 18:24:12 +03001899 MOVOU -16(AX)(DI*1), X3
Ilya Tocar95333ae2015-10-28 18:05:05 +03001900 PCMPEQB X0, X3
1901 PMOVMSKB X3, SI
1902 CMPQ SI, $0xffff
1903 JE success
1904 ADDQ $1,DI
Ilya Tocar6b02a192016-04-21 18:24:12 +03001905 CMPQ DI,DX
Ilya Tocar95333ae2015-10-28 18:05:05 +03001906 JB loop17to31
Ilya Tocar0cff2192016-04-28 17:39:55 +03001907 JMP fail
1908// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
1909// So no need to check cpuid
1910_32_or_more:
1911 CMPQ AX, $32
1912 JA _33_to_63
1913 VMOVDQU (BP), Y1
1914 LEAQ -31(DI)(DX*1), DX
1915loop32:
1916 VMOVDQU (DI), Y2
1917 VPCMPEQB Y1, Y2, Y3
1918 VPMOVMSKB Y3, SI
1919 CMPL SI, $0xffffffff
1920 JE success_avx2
1921 ADDQ $1,DI
1922 CMPQ DI,DX
1923 JB loop32
1924 JMP fail_avx2
1925_33_to_63:
1926 LEAQ 1(DI)(DX*1), DX
1927 SUBQ AX, DX
1928 VMOVDQU -32(BP)(AX*1), Y0
1929 VMOVDQU (BP), Y1
1930loop33to63:
1931 VMOVDQU (DI), Y2
1932 VPCMPEQB Y1, Y2, Y3
1933 VPMOVMSKB Y3, SI
1934 CMPL SI, $0xffffffff
1935 JE partial_success33to63
1936 ADDQ $1,DI
1937 CMPQ DI,DX
1938 JB loop33to63
1939 JMP fail_avx2
1940partial_success33to63:
1941 VMOVDQU -32(AX)(DI*1), Y3
1942 VPCMPEQB Y0, Y3, Y4
1943 VPMOVMSKB Y4, SI
1944 CMPL SI, $0xffffffff
1945 JE success_avx2
1946 ADDQ $1,DI
1947 CMPQ DI,DX
1948 JB loop33to63
1949fail_avx2:
1950 VZEROUPPER
Ilya Tocar95333ae2015-10-28 18:05:05 +03001951fail:
Ilya Tocar44f18542016-04-28 17:34:24 +03001952 MOVQ $-1, (R11)
Ilya Tocar95333ae2015-10-28 18:05:05 +03001953 RET
Ilya Tocar0cff2192016-04-28 17:39:55 +03001954success_avx2:
1955 VZEROUPPER
1956 JMP success
Ilya Tocar6b02a192016-04-21 18:24:12 +03001957sse42:
Martin Möhrmann5a6c5802017-04-27 08:30:27 +02001958 CMPB runtime·support_sse42(SB), $1
1959 JNE no_sse42
Ilya Tocar6b02a192016-04-21 18:24:12 +03001960 CMPQ AX, $12
1961	// PCMPESTRI is slower than a normal compare,
1962	// so using it makes sense only if we advance 4+ bytes per compare.
1963	// This cutoff was determined experimentally and is roughly the same
1964	// on Nehalem (the first CPU with SSE4.2) and Haswell.
1965 JAE _9_or_more
1966 LEAQ 16(BP), SI
1967 TESTW $0xff0, SI
1968 JEQ no_sse42
1969 MOVOU (BP), X1
1970 LEAQ -15(DI)(DX*1), SI
1971 MOVQ $16, R9
1972 SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
1973loop_sse42:
1974 // 0x0c means: unsigned byte compare (bits 0,1 are 00)
1975 // for equality (bits 2,3 are 11)
1976 // result is not masked or inverted (bits 4,5 are 00)
1977 // and corresponds to first matching byte (bit 6 is 0)
1978 PCMPESTRI $0x0c, (DI), X1
1979 // CX == 16 means no match,
1980 // CX > R9 means partial match at the end of the string,
1981 // otherwise sep is at offset CX from X1 start
1982 CMPQ CX, R9
1983 JBE sse42_success
1984 ADDQ R9, DI
1985 CMPQ DI, SI
1986 JB loop_sse42
1987 PCMPESTRI $0x0c, -1(SI), X1
1988 CMPQ CX, R9
1989 JA fail
1990 LEAQ -1(SI), DI
1991sse42_success:
1992 ADDQ CX, DI
Ilya Tocar95333ae2015-10-28 18:05:05 +03001993success:
Ilya Tocar44f18542016-04-28 17:34:24 +03001994 SUBQ R10, DI
1995 MOVQ DI, (R11)
Ilya Tocar95333ae2015-10-28 18:05:05 +03001996 RET
1997
1998
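// indexShortStr is the substring search used by strings.Index and
// bytes.Index for short separators: it returns the byte offset of the first
// occurrence of c in s, or -1. A naive Go sketch of that contract (the
// assembly specializes on len(c) and uses SSE4.2/AVX2 where possible;
// callers guarantee c is non-empty and short):
//
//	func indexShortStr(s, c string) int {
//		for i := 0; i+len(c) <= len(s); i++ {
//			if s[i:i+len(c)] == c {
//				return i
//			}
//		}
//		return -1
//	}
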
Shenghou Ma3b001972015-03-07 00:18:16 -05001999TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002000 MOVQ s+0(FP), SI
2001 MOVQ s_len+8(FP), BX
2002 MOVB c+24(FP), AL
Keith Randallc526f3a2015-04-21 14:22:41 -07002003 LEAQ ret+32(FP), R8
2004 JMP runtime·indexbytebody(SB)
Brad Fitzpatrick598c7892013-08-05 15:04:05 -07002005
Shenghou Ma3b001972015-03-07 00:18:16 -05002006TEXT strings·IndexByte(SB),NOSPLIT,$0-32
Brad Fitzpatrick598c7892013-08-05 15:04:05 -07002007 MOVQ s+0(FP), SI
2008 MOVQ s_len+8(FP), BX
2009 MOVB c+16(FP), AL
Keith Randallc526f3a2015-04-21 14:22:41 -07002010 LEAQ ret+24(FP), R8
2011 JMP runtime·indexbytebody(SB)
Brad Fitzpatrick598c7892013-08-05 15:04:05 -07002012
2013// input:
2014// SI: data
2015// BX: data len
2016// AL: byte sought
Keith Randallc526f3a2015-04-21 14:22:41 -07002017// R8: address to put result
Keith Randall5a546962013-08-07 10:23:24 -07002018TEXT runtime·indexbytebody(SB),NOSPLIT,$0
Keith Randall687abca2016-01-15 18:17:09 -08002019 // Shuffle X0 around so that each byte contains
2020 // the character we're looking for.
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002021 MOVD AX, X0
2022 PUNPCKLBW X0, X0
2023 PUNPCKLBW X0, X0
2024 PSHUFL $0, X0, X0
Keith Randall687abca2016-01-15 18:17:09 -08002025
2026 CMPQ BX, $16
2027 JLT small
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002028
Keith Randall687abca2016-01-15 18:17:09 -08002029 MOVQ SI, DI
2030
2031 CMPQ BX, $32
2032 JA avx2
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002033sse:
Keith Randall687abca2016-01-15 18:17:09 -08002034 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
2035 JMP sseloopentry
2036
2037sseloop:
2038 // Move the next 16-byte chunk of the data into X1.
2039 MOVOU (DI), X1
2040 // Compare bytes in X0 to X1.
2041 PCMPEQB X0, X1
2042 // Take the top bit of each byte in X1 and put the result in DX.
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002043 PMOVMSKB X1, DX
Keith Randall687abca2016-01-15 18:17:09 -08002044 // Find first set bit, if any.
2045 BSFL DX, DX
2046 JNZ ssesuccess
2047 // Advance to next block.
2048 ADDQ $16, DI
2049sseloopentry:
2050 CMPQ DI, AX
2051 JB sseloop
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002052
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +00002053 // Search the last 16-byte chunk. This chunk may overlap with the
Keith Randall687abca2016-01-15 18:17:09 -08002054 // chunks we've already searched, but that's ok.
2055 MOVQ AX, DI
2056 MOVOU (AX), X1
2057 PCMPEQB X0, X1
2058 PMOVMSKB X1, DX
2059 BSFL DX, DX
2060 JNZ ssesuccess
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002061
2062failure:
Keith Randallc526f3a2015-04-21 14:22:41 -07002063 MOVQ $-1, (R8)
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002064 RET
2065
Keith Randall687abca2016-01-15 18:17:09 -08002066// We've found a chunk containing the byte.
2067// The chunk was loaded from DI.
2068// The index of the matching byte in the chunk is DX.
2069// The start of the data is SI.
2070ssesuccess:
2071 SUBQ SI, DI // Compute offset of chunk within data.
2072 ADDQ DX, DI // Add offset of byte within chunk.
2073 MOVQ DI, (R8)
2074 RET
2075
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002076// handle lengths < 16
Russ Coxb55791e2014-10-28 21:50:16 -04002077small:
Keith Randall687abca2016-01-15 18:17:09 -08002078 TESTQ BX, BX
2079 JEQ failure
2080
2081 // Check if we'll load across a page boundary.
2082 LEAQ 16(SI), AX
2083 TESTW $0xff0, AX
2084 JEQ endofpage
2085
2086 MOVOU (SI), X1 // Load data
2087 PCMPEQB X0, X1 // Compare target byte with each byte in data.
2088 PMOVMSKB X1, DX // Move result bits to integer register.
2089 BSFL DX, DX // Find first set bit.
2090 JZ failure // No set bit, failure.
2091 CMPL DX, BX
2092 JAE failure // Match is past end of data.
2093 MOVQ DX, (R8)
2094 RET
2095
2096endofpage:
2097 MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
2098 PCMPEQB X0, X1 // Compare target byte with each byte in data.
2099 PMOVMSKB X1, DX // Move result bits to integer register.
2100 MOVL BX, CX
2101 SHLL CX, DX
2102 SHRL $16, DX // Shift desired bits down to bottom of register.
2103 BSFL DX, DX // Find first set bit.
2104 JZ failure // No set bit, failure.
2105 MOVQ DX, (R8)
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002106 RET
2107
Ilya Tocar321a4072015-10-29 18:52:22 +03002108avx2:
2109 CMPB runtime·support_avx2(SB), $1
Keith Randall687abca2016-01-15 18:17:09 -08002110 JNE sse
Ilya Tocar321a4072015-10-29 18:52:22 +03002111 MOVD AX, X0
2112 LEAQ -32(SI)(BX*1), R11
Russ Cox8d881b82016-01-22 22:25:15 -05002113 VPBROADCASTB X0, Y1
Ilya Tocar321a4072015-10-29 18:52:22 +03002114avx2_loop:
Russ Cox8d881b82016-01-22 22:25:15 -05002115 VMOVDQU (DI), Y2
2116 VPCMPEQB Y1, Y2, Y3
2117 VPTEST Y3, Y3
Ilya Tocar321a4072015-10-29 18:52:22 +03002118 JNZ avx2success
2119 ADDQ $32, DI
2120 CMPQ DI, R11
2121 JLT avx2_loop
2122 MOVQ R11, DI
Russ Cox8d881b82016-01-22 22:25:15 -05002123 VMOVDQU (DI), Y2
2124 VPCMPEQB Y1, Y2, Y3
2125 VPTEST Y3, Y3
Ilya Tocar321a4072015-10-29 18:52:22 +03002126 JNZ avx2success
2127 VZEROUPPER
2128 MOVQ $-1, (R8)
2129 RET
2130
2131avx2success:
Russ Cox8d881b82016-01-22 22:25:15 -05002132 VPMOVMSKB Y3, DX
Ilya Tocar321a4072015-10-29 18:52:22 +03002133 BSFL DX, DX
2134 SUBQ SI, DI
2135 ADDQ DI, DX
2136 MOVQ DX, (R8)
2137 VZEROUPPER
2138 RET
2139
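// indexbytebody is the core of bytes.IndexByte and strings.IndexByte above:
// find the first occurrence of a byte, or report -1. A portable Go sketch
// of the contract (reference only):
//
//	func indexByte(s []byte, c byte) int {
//		for i, b := range s {
//			if b == c {
//				return i
//			}
//		}
//		return -1
//	}
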
Keith Randall5a546962013-08-07 10:23:24 -07002140TEXT bytes·Equal(SB),NOSPLIT,$0-49
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002141 MOVQ a_len+8(FP), BX
2142 MOVQ b_len+32(FP), CX
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002143 CMPQ BX, CX
2144 JNE eqret
2145 MOVQ a+0(FP), SI
2146 MOVQ b+24(FP), DI
Keith Randallc526f3a2015-04-21 14:22:41 -07002147 LEAQ ret+48(FP), AX
2148 JMP runtime·memeqbody(SB)
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002149eqret:
Keith Randallc526f3a2015-04-21 14:22:41 -07002150 MOVB $0, ret+48(FP)
Brad Fitzpatricke2a1bd62013-08-01 16:11:19 -07002151 RET
Keith Randall6c7cbf02014-04-01 12:51:02 -07002152
Josselin Costanzid206af12017-03-27 13:22:59 +02002153
2154TEXT bytes·countByte(SB),NOSPLIT,$0-40
2155 MOVQ s+0(FP), SI
2156 MOVQ s_len+8(FP), BX
2157 MOVB c+24(FP), AL
2158 LEAQ ret+32(FP), R8
2159 JMP runtime·countByte(SB)
2160
2161TEXT strings·countByte(SB),NOSPLIT,$0-32
2162 MOVQ s+0(FP), SI
2163 MOVQ s_len+8(FP), BX
2164 MOVB c+16(FP), AL
2165 LEAQ ret+24(FP), R8
2166 JMP runtime·countByte(SB)
2167
2168// input:
2169// SI: data
2170// BX: data len
2171// AL: byte sought
2172// R8: address to put result
2173// This requires the POPCNT instruction
2174TEXT runtime·countByte(SB),NOSPLIT,$0
2175 // Shuffle X0 around so that each byte contains
2176 // the character we're looking for.
2177 MOVD AX, X0
2178 PUNPCKLBW X0, X0
2179 PUNPCKLBW X0, X0
2180 PSHUFL $0, X0, X0
2181
2182 CMPQ BX, $16
2183 JLT small
2184
2185 MOVQ $0, R12 // Accumulator
2186
2187 MOVQ SI, DI
2188
2189 CMPQ BX, $32
2190 JA avx2
2191sse:
2192 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
2193 JMP sseloopentry
2194
2195sseloop:
2196 // Move the next 16-byte chunk of the data into X1.
2197 MOVOU (DI), X1
2198 // Compare bytes in X0 to X1.
2199 PCMPEQB X0, X1
2200 // Take the top bit of each byte in X1 and put the result in DX.
2201 PMOVMSKB X1, DX
2202 // Count number of matching bytes
2203 POPCNTL DX, DX
2204 // Accumulate into R12
2205 ADDQ DX, R12
2206 // Advance to next block.
2207 ADDQ $16, DI
2208sseloopentry:
2209 CMPQ DI, AX
2210 JBE sseloop
2211
2212 // Get the number of bytes to consider in the last 16 bytes
2213 ANDQ $15, BX
2214 JZ end
2215
2216 // Create mask to ignore overlap between previous 16 byte block
2217 // and the next.
2218 MOVQ $16,CX
2219 SUBQ BX, CX
2220 MOVQ $0xFFFF, R10
2221 SARQ CL, R10
2222 SALQ CL, R10
2223
2224 // Process the last 16-byte chunk. This chunk may overlap with the
2225 // chunks we've already searched so we need to mask part of it.
2226 MOVOU (AX), X1
2227 PCMPEQB X0, X1
2228 PMOVMSKB X1, DX
2229 // Apply mask
2230 ANDQ R10, DX
2231 POPCNTL DX, DX
2232 ADDQ DX, R12
2233end:
2234 MOVQ R12, (R8)
2235 RET
2236
2237// handle lengths < 16
2238small:
2239 TESTQ BX, BX
2240 JEQ endzero
2241
2242 // Check if we'll load across a page boundary.
2243 LEAQ 16(SI), AX
2244 TESTW $0xff0, AX
2245 JEQ endofpage
2246
2247 // We must ignore high bytes as they aren't part of our slice.
2248 // Create mask.
2249 MOVB BX, CX
2250 MOVQ $1, R10
2251 SALQ CL, R10
2252 SUBQ $1, R10
2253
2254 // Load data
2255 MOVOU (SI), X1
2256 // Compare target byte with each byte in data.
2257 PCMPEQB X0, X1
2258 // Move result bits to integer register.
2259 PMOVMSKB X1, DX
2260 // Apply mask
2261 ANDQ R10, DX
2262 POPCNTL DX, DX
2263 // Directly return DX, we don't need to accumulate
2264 // since we have <16 bytes.
2265 MOVQ DX, (R8)
2266 RET
2267endzero:
2268 MOVQ $0, (R8)
2269 RET
2270
2271endofpage:
2272 // We must ignore low bytes as they aren't part of our slice.
2273 MOVQ $16,CX
2274 SUBQ BX, CX
2275 MOVQ $0xFFFF, R10
2276 SARQ CL, R10
2277 SALQ CL, R10
2278
2279 // Load data into the high end of X1.
2280 MOVOU -16(SI)(BX*1), X1
2281 // Compare target byte with each byte in data.
2282 PCMPEQB X0, X1
2283 // Move result bits to integer register.
2284 PMOVMSKB X1, DX
2285 // Apply mask
2286 ANDQ R10, DX
2287 // Directly return DX, we don't need to accumulate
2288 // since we have <16 bytes.
2289 POPCNTL DX, DX
2290 MOVQ DX, (R8)
2291 RET
2292
2293avx2:
2294 CMPB runtime·support_avx2(SB), $1
2295 JNE sse
2296 MOVD AX, X0
2297 LEAQ -32(SI)(BX*1), R11
2298 VPBROADCASTB X0, Y1
2299avx2_loop:
2300 VMOVDQU (DI), Y2
2301 VPCMPEQB Y1, Y2, Y3
2302 VPMOVMSKB Y3, DX
2303 POPCNTL DX, DX
2304 ADDQ DX, R12
2305 ADDQ $32, DI
2306 CMPQ DI, R11
2307 JLE avx2_loop
2308
2309 // If last block is already processed,
2310 // skip to the end.
2311 CMPQ DI, R11
2312 JEQ endavx
2313
2314 // Load address of the last 32 bytes.
2315 // There is an overlap with the previous block.
2316 MOVQ R11, DI
2317 VMOVDQU (DI), Y2
2318 VPCMPEQB Y1, Y2, Y3
2319 VPMOVMSKB Y3, DX
2320 // Exit AVX mode.
2321 VZEROUPPER
2322
2323 // Create mask to ignore overlap between previous 32 byte block
2324 // and the next.
2325 ANDQ $31, BX
2326 MOVQ $32,CX
2327 SUBQ BX, CX
2328 MOVQ $0xFFFFFFFF, R10
2329 SARQ CL, R10
2330 SALQ CL, R10
2331 // Apply mask
2332 ANDQ R10, DX
2333 POPCNTL DX, DX
2334 ADDQ DX, R12
2335 MOVQ R12, (R8)
2336 RET
2337endavx:
2338 // Exit AVX mode.
2339 VZEROUPPER
2340 MOVQ R12, (R8)
2341 RET
2342
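// countByte is the single-byte fast path behind bytes.Count and
// strings.Count; callers are expected to check for POPCNT support first
// (see the comment above). A portable Go sketch of the contract
// (reference only):
//
//	func countByte(s []byte, c byte) int {
//		n := 0
//		for _, b := range s {
//			if b == c {
//				n++
//			}
//		}
//		return n
//	}
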
Keith Randallf4407372014-09-03 08:49:43 -07002343TEXT runtime·return0(SB), NOSPLIT, $0
2344 MOVL $0, AX
2345 RET
Keith Randall1b6807b2014-09-25 07:59:01 -07002346
2347
2348// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
2349// Must obey the gcc calling convention.
Keith Randall1aa65fe2014-09-25 08:37:04 -07002350TEXT _cgo_topofstack(SB),NOSPLIT,$0
Keith Randall1b6807b2014-09-25 07:59:01 -07002351 get_tls(CX)
2352 MOVQ g(CX), AX
2353 MOVQ g_m(AX), AX
2354 MOVQ m_curg(AX), AX
2355 MOVQ (g_stack+stack_hi)(AX), AX
2356 RET
Russ Coxa5a07332014-10-29 20:37:44 -04002357
2358// The top-most function running on a goroutine
2359// returns to goexit+PCQuantum.
2360TEXT runtime·goexit(SB),NOSPLIT,$0-0
2361 BYTE $0x90 // NOP
2362 CALL runtime·goexit1(SB) // does not return
Dmitry Vyukov894024f2015-02-20 20:07:02 +03002363 // traceback from goexit1 must hit code range of goexit
2364 BYTE $0x90 // NOP
Russ Cox15ced2d2014-11-11 17:06:22 -05002365
Michael Hudson-Doylef616af22015-04-01 14:17:43 +13002366// This is called from .init_array and follows the platform, not Go, ABI.
Michael Hudson-Doylebe0cb922015-05-12 11:59:14 +12002367TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
2368 PUSHQ R15 // The access to global variables below implicitly uses R15, which is callee-save
Michael Hudson-Doylef616af22015-04-01 14:17:43 +13002369 MOVQ runtime·lastmoduledatap(SB), AX
2370 MOVQ DI, moduledata_next(AX)
2371 MOVQ DI, runtime·lastmoduledatap(SB)
Michael Hudson-Doylebe0cb922015-05-12 11:59:14 +12002372 POPQ R15
Michael Hudson-Doylef616af22015-04-01 14:17:43 +13002373 RET
Austin Clementse9079a62017-10-26 12:21:16 -04002374
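// In Go terms the body above is just a linked-list append (a hedged sketch;
// md stands for the *moduledata passed in DI):
//
//	lastmoduledatap.next = md
//	lastmoduledatap = md
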
2375// gcWriteBarrier performs a heap pointer write and informs the GC.
2376//
2377// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
2378// - DI is the destination of the write
2379// - AX is the value being written at DI
2380// It clobbers FLAGS. It does not clobber any general-purpose registers,
2381// but may clobber others (e.g., SSE registers).
Austin Clementse9079a62017-10-26 12:21:16 -04002382TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$120
Austin Clementsbf9ad702017-11-15 14:43:05 -08002383 // Save the registers clobbered by the fast path. This is slightly
2384 // faster than having the caller spill these.
Austin Clementse9079a62017-10-26 12:21:16 -04002385 MOVQ R14, 104(SP)
2386 MOVQ R13, 112(SP)
2387 // TODO: Consider passing g.m.p in as an argument so they can be shared
2388 // across a sequence of write barriers.
2389 get_tls(R13)
2390 MOVQ g(R13), R13
2391 MOVQ g_m(R13), R13
2392 MOVQ m_p(R13), R13
2393 MOVQ (p_wbBuf+wbBuf_next)(R13), R14
2394 // Increment wbBuf.next position.
2395 LEAQ 16(R14), R14
2396 MOVQ R14, (p_wbBuf+wbBuf_next)(R13)
2397 CMPQ R14, (p_wbBuf+wbBuf_end)(R13)
2398 // Record the write.
2399 MOVQ AX, -16(R14) // Record value
Austin Clements20101892018-01-15 12:27:17 -05002400 // Note: This turns bad pointer writes into bad
2401 // pointer reads, which could be confusing. We could avoid
2402 // reading from obviously bad pointers, which would
2403 // take care of the vast majority of these. We could
2404 // patch this up in the signal handler, or use XCHG to
2405 // combine the read and the write.
2406 MOVQ (DI), R13
Austin Clementse9079a62017-10-26 12:21:16 -04002407 MOVQ R13, -8(R14) // Record *slot
2408 // Is the buffer full? (flags set in CMPQ above)
2409 JEQ flush
2410ret:
2411 MOVQ 104(SP), R14
2412 MOVQ 112(SP), R13
2413 // Do the write.
2414 MOVQ AX, (DI)
2415 RET
2416
2417flush:
2418 // Save all general purpose registers since these could be
2419 // clobbered by wbBufFlush and were not saved by the caller.
2420 // It is possible for wbBufFlush to clobber other registers
2421 // (e.g., SSE registers), but the compiler takes care of saving
2422 // those in the caller if necessary. This strikes a balance
2423 // with registers that are likely to be used.
2424 //
2425 // We don't have type information for these, but all code under
2426 // here is NOSPLIT, so nothing will observe these.
2427 //
2428 // TODO: We could strike a different balance; e.g., saving X0
2429 // and not saving GP registers that are less likely to be used.
2430 MOVQ DI, 0(SP) // Also first argument to wbBufFlush
2431 MOVQ AX, 8(SP) // Also second argument to wbBufFlush
2432 MOVQ BX, 16(SP)
2433 MOVQ CX, 24(SP)
2434 MOVQ DX, 32(SP)
2435 // DI already saved
2436 MOVQ SI, 40(SP)
2437 MOVQ BP, 48(SP)
2438 MOVQ R8, 56(SP)
2439 MOVQ R9, 64(SP)
2440 MOVQ R10, 72(SP)
2441 MOVQ R11, 80(SP)
2442 MOVQ R12, 88(SP)
2443 // R13 already saved
2444 // R14 already saved
2445 MOVQ R15, 96(SP)
2446
2447 // This takes arguments DI and AX
2448 CALL runtime·wbBufFlush(SB)
2449
2450 MOVQ 0(SP), DI
2451 MOVQ 8(SP), AX
2452 MOVQ 16(SP), BX
2453 MOVQ 24(SP), CX
2454 MOVQ 32(SP), DX
2455 MOVQ 40(SP), SI
2456 MOVQ 48(SP), BP
2457 MOVQ 56(SP), R8
2458 MOVQ 64(SP), R9
2459 MOVQ 72(SP), R10
2460 MOVQ 80(SP), R11
2461 MOVQ 88(SP), R12
2462 MOVQ 96(SP), R15
2463 JMP ret
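
// For orientation, the fast path above behaves like this hedged Go sketch
// (wbBuf and wbBufFlush are the runtime pieces the assembly references;
// buf.record is only an illustrative name for the two inline stores):
//
//	buf := &getg().m.p.ptr().wbBuf
//	buf.next += 16
//	buf.record(val, *slot)    // value being written, then the old *slot
//	if buf.next == buf.end {
//		wbBufFlush(slot, val) // slow path: drain the buffer
//	}
//	*slot = val               // finally perform the actual write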