// Source: src/internal/runtime/gc/scan/scan_amd64.s - go - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 #include "go_asm.h"
 #include "textflag.h"

 // Test-only.
 //
 // ExpandAVX512 runs the gcExpandersAVX512 entry selected by sizeClass and
 // copies the 128-byte result left in Z1+Z2 to the memory at unpacked.
 //
 // Frame layout (24 bytes of arguments, no locals):
 //   sizeClass+0(FP)  index into the gcExpandersAVX512 table
 //   packed+8(FP)     value placed in AX before calling the expander
 //   unpacked+16(FP)  destination pointer; 128 bytes are written to it
 TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
 	MOVQ sizeClass+0(FP), CX
 	MOVQ packed+8(FP), AX

 	// Call the expander for this size class.
 	// The table is indexed with an 8-byte stride. AX is presumably the
 	// expander's input register; the expanders' calling convention is
 	// defined elsewhere and should be confirmed there.
 	LEAQ ·gcExpandersAVX512(SB), BX
 	CALL (BX)(CX*8)

 	// unpacked is loaded only after the call, so the stores below do not
 	// depend on the expander preserving DI.
 	MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
 	VMOVDQU64 Z1, 0(DI)
 	VMOVDQU64 Z2, 64(DI)
 	// Clear the upper vector state before returning so that later SSE code
 	// does not pay an AVX transition penalty.
 	VZEROUPPER
 	RET

 // scanSpanPackedAVX512 collects the words of a span that are selected by
 // both the expanded object-mark mask and the pointer mask, and packs them
 // contiguously into the scan buffer.
 //
 // Arguments, as read from the frame:
 //   mem+0(FP)         start address of the span memory; it is read with an
 //                     aligned 64-byte load, so it must be 64-byte aligned
 //   bufp+8(FP)        base address of the scan buffer
 //   objMarks+16(FP)   value placed in AX before calling the expander
 //   sizeClass+24(FP)  index into the gcExpandersAVX512 table
 //   ptrMask+32(FP)    pointer to 128 bytes of pointer mask
 //   count+40(FP)      result: number of words written to the scan buffer,
 //                     stored as a 32-bit value
 //
 // The 256-byte frame holds the 128-byte scan mask at 0(SP) and the 128
 // per-frame word counts at 128(SP). The loop visits 128 frames of 64 bytes
 // each, one per scan-mask byte.
 TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
 	// Z1+Z2 = Expand the grey object mask into a grey word mask
 	// (AX is presumably the expander's input register; its calling
 	// convention is defined elsewhere and should be confirmed there.)
 	MOVQ objMarks+16(FP), AX
 	MOVQ sizeClass+24(FP), CX
 	LEAQ ·gcExpandersAVX512(SB), BX
 	CALL (BX)(CX*8)

 	// Z3+Z4 = Load the pointer mask
 	MOVQ ptrMask+32(FP), AX
 	VMOVDQU64 0(AX), Z3
 	VMOVDQU64 64(AX), Z4

 	// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
 	VPANDQ Z1, Z3, Z1
 	VPANDQ Z2, Z4, Z2

 	// Now each bit of Z1+Z2 represents one word of the span.
 	// Thus, each byte covers 64 bytes of memory, which is also how
 	// much we can fit in a Z register.
 	//
 	// We do a load/compress for each 64 byte frame.
 	//
 	// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
 	VPOPCNTB Z1, Z3 // Requires BITALG
 	VPOPCNTB Z2, Z4

 	// Store the scan mask and word counts at 0(SP) and 128(SP).
 	//
 	// TODO: Is it better to read directly from the registers?
 	VMOVDQU64 Z1, 0(SP)
 	VMOVDQU64 Z2, 64(SP)
 	VMOVDQU64 Z3, 128(SP)
 	VMOVDQU64 Z4, 192(SP)

 	// SI = Current address in span
 	MOVQ mem+0(FP), SI
 	// DI = Scan buffer base
 	MOVQ bufp+8(FP), DI
 	// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
 	MOVQ $0, DX

 	// AX = address in scan mask, 128(AX) = address in popcount
 	LEAQ 0(SP), AX

 	// Loop over the 64 byte frames in this span.
 	// BX = 1 past the end of the scan mask
 	LEAQ 128(SP), BX

 	// Align loop to a cache line so that performance is less sensitive
 	// to how this function ends up laid out in memory. This is a hot
 	// function in the GC, and this is a tight loop. We don't want
 	// performance to waver wildly due to unrelated changes.
 	PCALIGN $64
 loop:
 	// CX = Fetch the mask of words to load from this frame.
 	MOVBQZX 0(AX), CX
 	// Skip empty frames.
 	TESTQ CX, CX
 	JZ skip

 	// Load the 64 byte frame.
 	// K1 = the 8-bit lane mask for this frame, one bit per 8-byte word.
 	// The load is an aligned load, so SI must be 64-byte aligned here.
 	KMOVB CX, K1
 	VMOVDQA64 0(SI), Z1

 	// Collect just the pointers from the greyed objects into the scan buffer,
 	// i.e., copy the word indices in the mask from Z1 into contiguous memory.
 	//
 	// N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
 	// AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
 	// imposes a severe performance penalty of around an order of magnitude
 	// compared to a register destination.
 	//
 	// This workaround is unfortunate on other microarchitectures, where a
 	// memory destination is slightly faster than adding an additional move
 	// instruction, but nowhere near an order of magnitude. It would be
 	// nice to have a Genoa-only variant here.
 	//
 	// AMD Turin / Zen 5 fixes this issue.
 	//
 	// See
 	// https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
 	//
 	// The store below always writes a full 64 bytes, while DX advances
 	// only by the number of selected words, so the words past that count
 	// are overwritten by the next store. NOTE(review): this presumably
 	// relies on the scan buffer having slack past its last valid entry;
 	// confirm against the caller.
 	VPCOMPRESSQ Z1, K1, Z2
 	VMOVDQU64 Z2, (DI)(DX*8)

 	// Advance the scan buffer position by the number of pointers.
 	MOVBQZX 128(AX), CX
 	ADDQ CX, DX

 skip:
 	// Step to the next 64-byte frame and the next scan mask byte.
 	ADDQ $64, SI
 	ADDQ $1, AX
 	// Unsigned comparison: keep looping while AX is below the mask end.
 	CMPQ AX, BX
 	JB loop

 end:
 	// Return the low 32 bits of the scan buffer index as the count.
 	MOVL DX, count+40(FP)
 	VZEROUPPER
 	RET
	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	#include "go_asm.h"
	#include "textflag.h"

	// Test-only.
	//
	// ExpandAVX512 runs the gcExpandersAVX512 entry selected by sizeClass and
	// copies the 128-byte result left in Z1+Z2 to the memory at unpacked.
	//
	// Frame layout (24 bytes of arguments, no locals):
	//   sizeClass+0(FP)  index into the gcExpandersAVX512 table
	//   packed+8(FP)     value placed in AX before calling the expander
	//   unpacked+16(FP)  destination pointer; 128 bytes are written to it
	TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
	MOVQ sizeClass+0(FP), CX
	MOVQ packed+8(FP), AX

	// Call the expander for this size class.
	// The table is indexed with an 8-byte stride. AX is presumably the
	// expander's input register; the expanders' calling convention is
	// defined elsewhere and should be confirmed there.
	LEAQ ·gcExpandersAVX512(SB), BX
	CALL (BX)(CX*8)

	// unpacked is loaded only after the call, so the stores below do not
	// depend on the expander preserving DI.
	MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
	VMOVDQU64 Z1, 0(DI)
	VMOVDQU64 Z2, 64(DI)
	// Clear the upper vector state before returning so that later SSE code
	// does not pay an AVX transition penalty.
	VZEROUPPER
	RET

	// scanSpanPackedAVX512 collects the words of a span that are selected by
	// both the expanded object-mark mask and the pointer mask, and packs them
	// contiguously into the scan buffer.
	//
	// Arguments, as read from the frame:
	//   mem+0(FP)         start address of the span memory; it is read with an
	//                     aligned 64-byte load, so it must be 64-byte aligned
	//   bufp+8(FP)        base address of the scan buffer
	//   objMarks+16(FP)   value placed in AX before calling the expander
	//   sizeClass+24(FP)  index into the gcExpandersAVX512 table
	//   ptrMask+32(FP)    pointer to 128 bytes of pointer mask
	//   count+40(FP)      result: number of words written to the scan buffer,
	//                     stored as a 32-bit value
	//
	// The 256-byte frame holds the 128-byte scan mask at 0(SP) and the 128
	// per-frame word counts at 128(SP). The loop visits 128 frames of 64 bytes
	// each, one per scan-mask byte.
	TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
	// Z1+Z2 = Expand the grey object mask into a grey word mask
	// (AX is presumably the expander's input register; its calling
	// convention is defined elsewhere and should be confirmed there.)
	MOVQ objMarks+16(FP), AX
	MOVQ sizeClass+24(FP), CX
	LEAQ ·gcExpandersAVX512(SB), BX
	CALL (BX)(CX*8)

	// Z3+Z4 = Load the pointer mask
	MOVQ ptrMask+32(FP), AX
	VMOVDQU64 0(AX), Z3
	VMOVDQU64 64(AX), Z4

	// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
	VPANDQ Z1, Z3, Z1
	VPANDQ Z2, Z4, Z2

	// Now each bit of Z1+Z2 represents one word of the span.
	// Thus, each byte covers 64 bytes of memory, which is also how
	// much we can fit in a Z register.
	//
	// We do a load/compress for each 64 byte frame.
	//
	// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
	VPOPCNTB Z1, Z3 // Requires BITALG
	VPOPCNTB Z2, Z4

	// Store the scan mask and word counts at 0(SP) and 128(SP).
	//
	// TODO: Is it better to read directly from the registers?
	VMOVDQU64 Z1, 0(SP)
	VMOVDQU64 Z2, 64(SP)
	VMOVDQU64 Z3, 128(SP)
	VMOVDQU64 Z4, 192(SP)

	// SI = Current address in span
	MOVQ mem+0(FP), SI
	// DI = Scan buffer base
	MOVQ bufp+8(FP), DI
	// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
	MOVQ $0, DX

	// AX = address in scan mask, 128(AX) = address in popcount
	LEAQ 0(SP), AX

	// Loop over the 64 byte frames in this span.
	// BX = 1 past the end of the scan mask
	LEAQ 128(SP), BX

	// Align loop to a cache line so that performance is less sensitive
	// to how this function ends up laid out in memory. This is a hot
	// function in the GC, and this is a tight loop. We don't want
	// performance to waver wildly due to unrelated changes.
	PCALIGN $64
	loop:
	// CX = Fetch the mask of words to load from this frame.
	MOVBQZX 0(AX), CX
	// Skip empty frames.
	TESTQ CX, CX
	JZ skip

	// Load the 64 byte frame.
	// K1 = the 8-bit lane mask for this frame, one bit per 8-byte word.
	// The load is an aligned load, so SI must be 64-byte aligned here.
	KMOVB CX, K1
	VMOVDQA64 0(SI), Z1

	// Collect just the pointers from the greyed objects into the scan buffer,
	// i.e., copy the word indices in the mask from Z1 into contiguous memory.
	//
	// N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
	// AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
	// imposes a severe performance penalty of around an order of magnitude
	// compared to a register destination.
	//
	// This workaround is unfortunate on other microarchitectures, where a
	// memory destination is slightly faster than adding an additional move
	// instruction, but nowhere near an order of magnitude. It would be
	// nice to have a Genoa-only variant here.
	//
	// AMD Turin / Zen 5 fixes this issue.
	//
	// See
	// https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
	//
	// The store below always writes a full 64 bytes, while DX advances
	// only by the number of selected words, so the words past that count
	// are overwritten by the next store. NOTE(review): this presumably
	// relies on the scan buffer having slack past its last valid entry;
	// confirm against the caller.
	VPCOMPRESSQ Z1, K1, Z2
	VMOVDQU64 Z2, (DI)(DX*8)

	// Advance the scan buffer position by the number of pointers.
	MOVBQZX 128(AX), CX
	ADDQ CX, DX

	skip:
	// Step to the next 64-byte frame and the next scan mask byte.
	ADDQ $64, SI
	ADDQ $1, AX
	// Unsigned comparison: keep looping while AX is below the mask end.
	CMPQ AX, BX
	JB loop

	end:
	// Return the low 32 bits of the scan buffer index as the count.
	MOVL DX, count+40(FP)
	VZEROUPPER
	RET