internal/bytealg: implement bytes.Count in asm for ppc64x
This adds an asm implementation for the Count function in ppc64x.
The Go code that manipulates a byte at a time is especially
inefficient on ppc64x, so an asm implementation is a significant
improvement.
bytes:
name old time/op new time/op delta
CountSingle/10-8 23.1ns ± 0% 18.6ns ± 0% -19.48% (p=1.000 n=1+1)
CountSingle/32-8 60.4ns ± 0% 19.0ns ± 0% -68.54% (p=1.000 n=1+1)
CountSingle/4K-8 7.29µs ± 0% 0.45µs ± 0% -93.80% (p=1.000 n=1+1)
CountSingle/4M-8 7.49ms ± 0% 0.45ms ± 0% -93.97% (p=1.000 n=1+1)
CountSingle/64M-8 127ms ± 0% 9ms ± 0% -92.53% (p=1.000 n=1+1)
html:
name old time/op new time/op delta
Escape-8 57.5µs ± 0% 36.1µs ± 0% -37.13% (p=1.000 n=1+1)
EscapeNone-8 20.0µs ± 0% 2.0µs ± 0% -90.14% (p=1.000 n=1+1)
Change-Id: Iadbf422c0e9a37b47d2d95fb8c778420f3aabb58
Reviewed-on: https://go-review.googlesource.com/131695
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <mike.munday@ibm.com>
diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go
index a763b3b..e24b2b7 100644
--- a/src/internal/bytealg/count_generic.go
+++ b/src/internal/bytealg/count_generic.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!arm64
+// +build !amd64,!arm64,!ppc64le,!ppc64
package bytealg
diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go
index a62c4cb..e6a91b3 100644
--- a/src/internal/bytealg/count_native.go
+++ b/src/internal/bytealg/count_native.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 arm64
+// +build amd64 arm64 ppc64le ppc64
package bytealg
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s
new file mode 100644
index 0000000..7abdce1
--- /dev/null
+++ b/src/internal/bytealg/count_ppc64x.s
@@ -0,0 +1,97 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64le ppc64
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+ MOVD b_base+0(FP), R3 // R3 = byte array pointer
+ MOVD b_len+8(FP), R4 // R4 = length
+ MOVBZ c+24(FP), R5 // R5 = byte
+ MOVD $ret+32(FP), R14 // R14 = &ret
+ BR countbytebody<>(SB)
+
+TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+ MOVD s_base+0(FP), R3 // R3 = string
+ MOVD s_len+8(FP), R4 // R4 = length
+ MOVBZ c+16(FP), R5 // R5 = byte
+ MOVD $ret+24(FP), R14 // R14 = &ret
+ BR countbytebody<>(SB)
+
+// R3: addr of string
+// R4: len of string
+// R5: byte to count
+// R14: addr for return value
+// endianness shouldn't matter since we are just counting and order
+// is irrelevant
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+ DCBT (R3) // Prepare cache line.
+ MOVD R0, R18 // byte count
+ MOVD R3, R19 // Save base address for calculating the index later.
+ MOVD R4, R16
+
+ MOVD R5, R6
+ RLDIMI $8, R6, $48, R6
+ RLDIMI $16, R6, $32, R6
+ RLDIMI $32, R6, $0, R6 // fill reg with the byte to count
+
+ VSPLTISW $3, V4 // used for shift
+ MTVRD R6, V1 // move compare byte
+ VSPLTB $7, V1, V1 // replicate byte across V1
+
+ CMPU R4, $32 // Check if it's a small string (<32 bytes)
+ BLT tail // Jump to the small string case
+ XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
+
+cmploop:
+ LXVW4X (R3), VS32 // load bytes from string
+
+ // when the bytes match, the corresonding byte contains all 1s
+ VCMPEQUB V1, V0, V2 // compare bytes
+ VPOPCNTD V2, V3 // each double word contains its count
+ VADDUDM V3, V5, V5 // accumulate bit count in each double word
+ ADD $16, R3, R3 // increment pointer
+ SUB $16, R16, R16 // remaining bytes
+ CMP R16, $16 // at least 16 remaining?
+ BGE cmploop
+ VSRD V5, V4, V5 // shift by 3 to convert bits to bytes
+ VSLDOI $8, V5, V5, V6 // get the double word values from vector
+ MFVSRD V5, R9
+ MFVSRD V6, R10
+ ADD R9, R10, R9
+ ADD R9, R18, R18
+
+tail:
+ CMP R16, $8 // 8 bytes left?
+ BLT small
+
+ MOVD (R3), R12 // load 8 bytes
+ CMPB R12, R6, R17 // compare bytes
+ POPCNTD R17, R15 // bit count
+ SRD $3, R15, R15 // byte count
+ ADD R15, R18, R18 // add to byte count
+
+next1:
+ ADD $8, R3, R3
+ SUB $8, R16, R16 // remaining bytes
+ BR tail
+
+small:
+ CMP $0, R16 // any remaining
+ BEQ done
+ MOVBZ (R3), R12 // check each remaining byte
+ CMP R12, R5
+ BNE next2
+ ADD $1, R18
+
+next2:
+ SUB $1, R16
+ ADD $1, R3 // inc address
+ BR small
+
+done:
+ MOVD R18, (R14) // return count
+ RET