// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"
#include "asm_amd64.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
	// AX = ptr
	// BX = n
	MOVQ	AX, DI	// DI = ptr
	XORQ	AX, AX
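	// AX is zeroed here and used as the store source for the scalar (1..16 byte)
	// cases below; the 16-byte MOVOU cases rely on X15, which the Go internal ABI
	// keeps as an always-zero SIMD register on amd64.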

	// MOVOU is generally faster than REP STOSQ when Enhanced REP STOSB (ERMS) is not available.
tail:
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256

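	// Sizes above 256 bytes are handled by the bulk loops below, choosing between
	// the REP STOSQ (ERMS), SSE, and AVX2 paths based on CPU features and size.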
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	skip_erms

	// If the size is less than 2KB, do not use ERMS as it has a high start-up cost.
	// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
	// in the Intel Optimization Guide shows better performance for ERMSB starting
	// at 2KB. Benchmarks show a similar threshold for REP STOS vs. AVX.
	CMPQ	BX, $2048
	JAE	loop_preheader_erms

skip_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

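	// Non-AVX2 bulk path: each iteration clears 256 bytes with sixteen 16-byte
	// stores of X15 (zero); the sub-256-byte remainder is re-dispatched via tail.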
loop:
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail
#endif

loop_preheader_avx2:
	VPXOR	X0, X0, X0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with a 30MB cache.
	// TODO: take the actual LLC size into account; e.g., glibc uses LLC size / 2.
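	// 0x2000000 = 32MiB; clears at least this large take the non-temporal store path.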
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge

loop_avx2:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
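	// BX is now < 128. The four overlapping 32-byte stores below end exactly at
	// ptr+n, re-clearing up to 128 already-zeroed bytes, which is safe because
	// n > 256 on this path.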
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

loop_preheader_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	loop_erms
#endif

	VPXOR	X0, X0, X0
	// At this point both ERMS and AVX2 are supported. While REP STOS can use a no-RFO
	// write protocol, ERMS can show the same or worse performance compared to
	// non-temporal stores when the size is larger than the LLC, depending on hardware.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge

loop_erms:
	// STOSQ is used to guarantee that each zeroed pointer-sized word becomes visible
	// to the memory subsystem as a whole, as the GC requires.
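	// CX = BX>>3 quadwords are cleared with REP STOSQ; the remaining BX&7 bytes
	// (0..7) are handled by jumping back to tail. For example, n = 1003 gives
	// CX = 125 and a 3-byte remainder.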
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	STOSQ
	JMP	tail

loop_preheader_avx2_huge:
	// Align to 32 byte boundary
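	// A sketch of the alignment math: the unconditional VMOVDQU clears the first
	// 32 (possibly unaligned) bytes, DI is then rounded up to the next 32-byte
	// boundary, and BX is reduced by the number of bytes skipped (SI holds the
	// negative delta). E.g. if DI ends in 0x08, DI advances by 24 and BX drops by 24.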
	VMOVDQU	Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
loop_avx2_huge:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]:
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

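	// The fixed-size cases below use stores anchored at the start and at ptr+n,
	// so overlapping writes cover every length in the range with a constant
	// number of instructions (e.g. for n in 9..16 the two 8-byte stores together
	// cover all n bytes).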
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET