Brad Fitzpatrick | 5194744 | 2016-03-01 22:57:46 +0000 | [diff] [blame] | 1 | // Copyright 2010 The Go Authors. All rights reserved. |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
Rob Pike | 8bca148 | 2014-08-12 17:04:45 -0700 | [diff] [blame] | 5 | #include "textflag.h" |
Keith Randall | 1f79663 | 2013-08-12 10:25:18 -0700 | [diff] [blame] | 6 | |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 7 | // The method is based on a paper by Naoki Shibata: "Efficient evaluation |
| 8 | // methods of elementary functions suitable for SIMD computation", Proc. |
| 9 | // of International Supercomputing Conference 2010 (ISC'10), pp. 25 -- 32 |
| 10 | // (May 2010). The paper is available at |
Tim Cooper | 161874d | 2018-06-01 17:29:59 -0300 | [diff] [blame] | 11 | // https://www.springerlink.com/content/340228x165742104/ |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 12 | // |
| 13 | // The original code and the constants below are from the author's |
| 14 | // implementation available at http://freshmeat.net/projects/sleef. |
| 15 | // The README file says, "The software is in public domain. |
| 16 | // You can use the software without any obligation." |
| 17 | // |
| 18 | // This code is a simplified version of the original. |
| 19 | |
| 20 | #define LN2 0.6931471805599453094172321214581766 // log_e(2) |
| 21 | #define LOG2E 1.4426950408889634073599246810018920 // 1/LN2 |
| 22 | #define LN2U 0.69314718055966295651160180568695068359375 // upper half LN2 |
| 23 | #define LN2L 0.28235290563031577122588448175013436025525412068e-12 // lower half LN2 |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 24 | #define PosInf 0x7FF0000000000000 |
| 25 | #define NegInf 0xFFF0000000000000 |
Alberto Donizetti | f44e587 | 2017-02-03 10:36:47 +0100 | [diff] [blame] | 26 | #define Overflow 7.09782712893384e+02 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 27 | |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 28 | DATA exprodata<>+0(SB)/8, $0.5 |
| 29 | DATA exprodata<>+8(SB)/8, $1.0 |
| 30 | DATA exprodata<>+16(SB)/8, $2.0 |
| 31 | DATA exprodata<>+24(SB)/8, $1.6666666666666666667e-1 |
| 32 | DATA exprodata<>+32(SB)/8, $4.1666666666666666667e-2 |
| 33 | DATA exprodata<>+40(SB)/8, $8.3333333333333333333e-3 |
| 34 | DATA exprodata<>+48(SB)/8, $1.3888888888888888889e-3 |
| 35 | DATA exprodata<>+56(SB)/8, $1.9841269841269841270e-4 |
| 36 | DATA exprodata<>+64(SB)/8, $2.4801587301587301587e-5 |
| 37 | GLOBL exprodata<>+0(SB), RODATA, $72 |
| 38 | |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 39 | // func Exp(x float64) float64 |
Keith Randall | 1f79663 | 2013-08-12 10:25:18 -0700 | [diff] [blame] | 40 | TEXT ·Exp(SB),NOSPLIT,$0 |
Alberto Donizetti | f44e587 | 2017-02-03 10:36:47 +0100 | [diff] [blame] | 41 | // test bits for not-finite |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 42 | MOVQ x+0(FP), BX |
| 43 | MOVQ $~(1<<63), AX // sign bit mask |
| 44 | MOVQ BX, DX |
| 45 | ANDQ AX, DX |
| 46 | MOVQ $PosInf, AX |
| 47 | CMPQ AX, DX |
| 48 | JLE notFinite |
Alberto Donizetti | f44e587 | 2017-02-03 10:36:47 +0100 | [diff] [blame] | 49 | // check if argument will overflow |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 50 | MOVQ BX, X0 |
Alberto Donizetti | f44e587 | 2017-02-03 10:36:47 +0100 | [diff] [blame] | 51 | MOVSD $Overflow, X1 |
| 52 | COMISD X1, X0 |
| 53 | JA overflow |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 54 | MOVSD $LOG2E, X1 |
| 55 | MULSD X0, X1 |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 56 | CVTSD2SL X1, BX // BX = exponent |
| 57 | CVTSL2SD BX, X1 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 58 | CMPB ·useFMA(SB), $1 |
| 59 | JE avxfma |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 60 | MOVSD $LN2U, X2 |
| 61 | MULSD X1, X2 |
| 62 | SUBSD X2, X0 |
| 63 | MOVSD $LN2L, X2 |
| 64 | MULSD X1, X2 |
| 65 | SUBSD X2, X0 |
| 66 | // reduce argument |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 67 | MULSD $0.0625, X0 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 68 | // Taylor series evaluation |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 69 | MOVSD exprodata<>+64(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 70 | MULSD X0, X1 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 71 | ADDSD exprodata<>+56(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 72 | MULSD X0, X1 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 73 | ADDSD exprodata<>+48(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 74 | MULSD X0, X1 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 75 | ADDSD exprodata<>+40(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 76 | MULSD X0, X1 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 77 | ADDSD exprodata<>+32(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 78 | MULSD X0, X1 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 79 | ADDSD exprodata<>+24(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 80 | MULSD X0, X1 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 81 | ADDSD exprodata<>+0(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 82 | MULSD X0, X1 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 83 | ADDSD exprodata<>+8(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 84 | MULSD X1, X0 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 85 | MOVSD exprodata<>+16(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 86 | ADDSD X0, X1 |
| 87 | MULSD X1, X0 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 88 | MOVSD exprodata<>+16(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 89 | ADDSD X0, X1 |
| 90 | MULSD X1, X0 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 91 | MOVSD exprodata<>+16(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 92 | ADDSD X0, X1 |
| 93 | MULSD X1, X0 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 94 | MOVSD exprodata<>+16(SB), X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 95 | ADDSD X0, X1 |
| 96 | MULSD X1, X0 |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 97 | ADDSD exprodata<>+8(SB), X0 |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 98 | // return fr * 2**exponent |
Wèi Cōngruì | 7fe2f54 | 2018-01-09 11:48:15 +0800 | [diff] [blame] | 99 | ldexp: |
| 100 | ADDL $0x3FF, BX // add bias |
| 101 | JLE denormal |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 102 | CMPL BX, $0x7FF |
| 103 | JGE overflow |
Wèi Cōngruì | 7fe2f54 | 2018-01-09 11:48:15 +0800 | [diff] [blame] | 104 | lastStep: |
| 105 | SHLQ $52, BX |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 106 | MOVQ BX, X1 |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 107 | MULSD X1, X0 |
Russ Cox | 07720b6 | 2013-03-22 12:57:55 -0400 | [diff] [blame] | 108 | MOVSD X0, ret+8(FP) |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 109 | RET |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 110 | notFinite: |
| 111 | // test bits for -Inf |
| 112 | MOVQ $NegInf, AX |
| 113 | CMPQ AX, BX |
| 114 | JNE notNegInf |
| 115 | // -Inf, return 0 |
| 116 | underflow: // return 0 |
Wèi Cōngruì | 7fe2f54 | 2018-01-09 11:48:15 +0800 | [diff] [blame] | 117 | MOVQ $0, ret+8(FP) |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 118 | RET |
Charles L. Dorian | c8c2bdb | 2010-08-06 16:50:48 -0700 | [diff] [blame] | 119 | overflow: // return +Inf |
| 120 | MOVQ $PosInf, BX |
| 121 | notNegInf: // NaN or +Inf, return x |
Russ Cox | 07720b6 | 2013-03-22 12:57:55 -0400 | [diff] [blame] | 122 | MOVQ BX, ret+8(FP) |
Charles L. Dorian | a0117ba | 2010-06-30 14:44:27 -0700 | [diff] [blame] | 123 | RET |
Wèi Cōngruì | 7fe2f54 | 2018-01-09 11:48:15 +0800 | [diff] [blame] | 124 | denormal: |
| 125 | CMPL BX, $-52 |
| 126 | JL underflow |
| 127 | ADDL $0x3FE, BX // add bias - 1 |
| 128 | SHLQ $52, BX |
| 129 | MOVQ BX, X1 |
| 130 | MULSD X1, X0 |
| 131 | MOVQ $1, BX |
| 132 | JMP lastStep |
Agniva De Sarker | d2f3172 | 2017-09-09 19:47:37 +0530 | [diff] [blame] | 133 | |
| 134 | avxfma: |
| 135 | MOVSD $LN2U, X2 |
| 136 | VFNMADD231SD X2, X1, X0 |
| 137 | MOVSD $LN2L, X2 |
| 138 | VFNMADD231SD X2, X1, X0 |
| 139 | // reduce argument |
| 140 | MULSD $0.0625, X0 |
| 141 | // Taylor series evaluation |
| 142 | MOVSD exprodata<>+64(SB), X1 |
| 143 | VFMADD213SD exprodata<>+56(SB), X0, X1 |
| 144 | VFMADD213SD exprodata<>+48(SB), X0, X1 |
| 145 | VFMADD213SD exprodata<>+40(SB), X0, X1 |
| 146 | VFMADD213SD exprodata<>+32(SB), X0, X1 |
| 147 | VFMADD213SD exprodata<>+24(SB), X0, X1 |
| 148 | VFMADD213SD exprodata<>+0(SB), X0, X1 |
| 149 | VFMADD213SD exprodata<>+8(SB), X0, X1 |
| 150 | MULSD X1, X0 |
| 151 | VADDSD exprodata<>+16(SB), X0, X1 |
| 152 | MULSD X1, X0 |
| 153 | VADDSD exprodata<>+16(SB), X0, X1 |
| 154 | MULSD X1, X0 |
| 155 | VADDSD exprodata<>+16(SB), X0, X1 |
| 156 | MULSD X1, X0 |
| 157 | VADDSD exprodata<>+16(SB), X0, X1 |
| 158 | VFMADD213SD exprodata<>+8(SB), X1, X0 |
Wèi Cōngruì | 7fe2f54 | 2018-01-09 11:48:15 +0800 | [diff] [blame] | 159 | JMP ldexp |