blob: acf2b06665a34239b17341a1f136ae9b2b08a6d4 [file] [log] [blame]
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !math_big_pure_go
// +build !math_big_pure_go
#include "textflag.h"
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),NOSPLIT,$0
MOVL x+0(FP), AX
MULL y+4(FP)
MOVL DX, z1+8(FP)
MOVL AX, z0+12(FP)
RET
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), CX
MOVL z_len+4(FP), BP
MOVL $0, BX // i = 0
MOVL $0, DX // c = 0
JMP E1
L1: MOVL (SI)(BX*4), AX
ADDL DX, DX // restore CF
ADCL (CX)(BX*4), AX
SBBL DX, DX // save CF
MOVL AX, (DI)(BX*4)
ADDL $1, BX // i++
E1: CMPL BX, BP // i < n
JL L1
NEGL DX
MOVL DX, c+36(FP)
RET
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBL instead of ADCL and label names)
TEXT ·subVV(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), CX
MOVL z_len+4(FP), BP
MOVL $0, BX // i = 0
MOVL $0, DX // c = 0
JMP E2
L2: MOVL (SI)(BX*4), AX
ADDL DX, DX // restore CF
SBBL (CX)(BX*4), AX
SBBL DX, DX // save CF
MOVL AX, (DI)(BX*4)
ADDL $1, BX // i++
E2: CMPL BX, BP // i < n
JL L2
NEGL DX
MOVL DX, c+36(FP)
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), AX // c = y
MOVL z_len+4(FP), BP
MOVL $0, BX // i = 0
JMP E3
L3: ADDL (SI)(BX*4), AX
MOVL AX, (DI)(BX*4)
SBBL AX, AX // save CF
NEGL AX
ADDL $1, BX // i++
E3: CMPL BX, BP // i < n
JL L3
MOVL AX, c+28(FP)
RET
// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), AX // c = y
MOVL z_len+4(FP), BP
MOVL $0, BX // i = 0
JMP E4
L4: MOVL (SI)(BX*4), DX
SUBL AX, DX
MOVL DX, (DI)(BX*4)
SBBL AX, AX // save CF
NEGL AX
ADDL $1, BX // i++
E4: CMPL BX, BP // i < n
JL L4
MOVL AX, c+28(FP)
RET
// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB),NOSPLIT,$0
MOVL z_len+4(FP), BX // i = z
SUBL $1, BX // i--
JL X8b // i < 0 (n <= 0)
// n > 0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL s+24(FP), CX
MOVL (SI)(BX*4), AX // w1 = x[n-1]
MOVL $0, DX
SHLL CX, AX, DX // w1>>ŝ
MOVL DX, c+28(FP)
CMPL BX, $0
JLE X8a // i <= 0
// i > 0
L8: MOVL AX, DX // w = w1
MOVL -4(SI)(BX*4), AX // w1 = x[i-1]
SHLL CX, AX, DX // w<<s | w1>>ŝ
MOVL DX, (DI)(BX*4) // z[i] = w<<s | w1>>ŝ
SUBL $1, BX // i--
JG L8 // i > 0
// i <= 0
X8a: SHLL CX, AX // w1<<s
MOVL AX, (DI) // z[0] = w1<<s
RET
X8b: MOVL $0, c+28(FP)
RET
// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB),NOSPLIT,$0
MOVL z_len+4(FP), BP
SUBL $1, BP // n--
JL X9b // n < 0 (n <= 0)
// n > 0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL s+24(FP), CX
MOVL (SI), AX // w1 = x[0]
MOVL $0, DX
SHRL CX, AX, DX // w1<<ŝ
MOVL DX, c+28(FP)
MOVL $0, BX // i = 0
JMP E9
// i < n-1
L9: MOVL AX, DX // w = w1
MOVL 4(SI)(BX*4), AX // w1 = x[i+1]
SHRL CX, AX, DX // w>>s | w1<<ŝ
MOVL DX, (DI)(BX*4) // z[i] = w>>s | w1<<ŝ
ADDL $1, BX // i++
E9: CMPL BX, BP
JL L9 // i < n-1
// i >= n-1
X9a: SHRL CX, AX // w1>>s
MOVL AX, (DI)(BP*4) // z[n-1] = w1>>s
RET
X9b: MOVL $0, c+28(FP)
RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), BP
MOVL r+28(FP), CX // c = r
MOVL z_len+4(FP), BX
LEAL (DI)(BX*4), DI
LEAL (SI)(BX*4), SI
NEGL BX // i = -n
JMP E5
L5: MOVL (SI)(BX*4), AX
MULL BP
ADDL CX, AX
ADCL $0, DX
MOVL AX, (DI)(BX*4)
MOVL DX, CX
ADDL $1, BX // i++
E5: CMPL BX, $0 // i < 0
JL L5
MOVL CX, c+32(FP)
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB),NOSPLIT,$0
MOVL z+0(FP), DI
MOVL x+12(FP), SI
MOVL y+24(FP), BP
MOVL z_len+4(FP), BX
LEAL (DI)(BX*4), DI
LEAL (SI)(BX*4), SI
NEGL BX // i = -n
MOVL $0, CX // c = 0
JMP E6
L6: MOVL (SI)(BX*4), AX
MULL BP
ADDL CX, AX
ADCL $0, DX
ADDL AX, (DI)(BX*4)
ADCL $0, DX
MOVL DX, CX
ADDL $1, BX // i++
E6: CMPL BX, $0 // i < 0
JL L6
MOVL CX, c+28(FP)
RET