| // Inferno's libkern/memmove-arm.s |
| // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-arm.s |
| // |
| // Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. |
| // Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. |
| // Portions Copyright 2009 The Go Authors. All rights reserved. |
| // |
| // Permission is hereby granted, free of charge, to any person obtaining a copy |
| // of this software and associated documentation files (the "Software"), to deal |
| // in the Software without restriction, including without limitation the rights |
| // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| // copies of the Software, and to permit persons to whom the Software is |
| // furnished to do so, subject to the following conditions: |
| // |
| // The above copyright notice and this permission notice shall be included in |
| // all copies or substantial portions of the Software. |
| // |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| // THE SOFTWARE. |
| |
| #include "textflag.h" |
| |
| // TS (to start) and TE (to end): whichever one a bulk register move overwrites is spilled to the stack first. |
| TS = 0 /* register 0: destination pointer, loaded from to+0(FP) */ |
| TE = 8 /* register 8: destination end pointer, computed as TS + N */ |
| |
| // Warning: the linker will use R11 to synthesize certain instructions. Please |
| // take care and double check with objdump. |
| FROM = 11 /* register 11: source pointer, loaded from from+4(FP) */ |
| N = 12 /* register 12: byte count, loaded from n+8(FP); read only before TMP is first written */ |
| TMP = 12 /* same register as N: N and TMP are never live at the same time */ |
| TMP1 = 5 /* register 5: data word in the aligned 4-byte loops */ |
| /* Shift amounts and source fix-up used only by the unaligned paths. |
|  * RSHIFT shares register 5 with TMP1; the unaligned paths never enter the |
|  * aligned 4-byte loops, so the two uses do not meet. */ |
| RSHIFT = 5 /* right-shift amount (8, 16 or 24) */ |
| LSHIFT = 6 /* left-shift amount (24, 16 or 8) */ |
| OFFSET = 7 /* byte correction applied to FROM after the unaligned loop */ |
| /* Backward unaligned loop: BR0-BR3 (registers 0-3) are loaded from the source, |
|  * BW0-BW3 (registers 1-4) are stored to the destination. BWk uses the same |
|  * register as BR(k+1). */ |
| BR0 = 0 /* shared with TS */ |
| BW0 = 1 |
| BR1 = 1 |
| BW1 = 2 |
| BR2 = 2 |
| BW2 = 3 |
| BR3 = 3 |
| BW3 = 4 |
| /* Forward unaligned loop: FR0-FR3 (registers 2, 3, 4, 8) are loaded from the |
|  * source, FW0-FW3 (registers 1-4) are stored to the destination. FW(k+1) uses |
|  * the same register as FRk. */ |
| FW0 = 1 |
| FR0 = 2 |
| FW1 = 2 |
| FR1 = 3 |
| FW2 = 3 |
| FR2 = 4 |
| FW3 = 4 |
| FR3 = 8 /* shared with TE */ |
| /* runtime·memmove: copy n bytes from `from` to `to`. |
|  * Arguments are read from to+0(FP), from+4(FP) and n+8(FP). |
|  * When to <= from (unsigned) the copy runs forward from the start of both |
|  * buffers; otherwise it runs backward from their ends, so source bytes are |
|  * read before an overlapping destination overwrites them. |
|  * Each direction first aligns the destination pointer on 4 bytes. If the |
|  * source is then aligned too, it copies 32-byte blocks followed by single |
|  * words; if not, it copies 16-byte blocks built by shifting and merging |
|  * adjacent aligned source words. Leftover bytes are copied one at a time. |
|  * The 4-byte frame slot holds TS (backward paths) or TE (forward paths) while |
|  * a block move reuses that register. R0 is reloaded with `to` before RET. |
|  */ |
| TEXT runtime·memmove(SB), NOSPLIT, $4-12 |
| _memmove: |
| MOVW to+0(FP), R(TS) /* TS = to */ |
| MOVW from+4(FP), R(FROM) /* FROM = from */ |
| MOVW n+8(FP), R(N) /* N = n */ |
| |
| ADD R(N), R(TS), R(TE) /* TE = TS + N: to end pointer */ |
| |
| CMP R(FROM), R(TS) /* to <= from (unsigned)? */ |
| BLS _forward /* yes: copy forward; no: fall through and copy backward */ |
| |
| _back: |
| ADD R(N), R(FROM) /* FROM += N: from end pointer */ |
| CMP $4, R(N) /* need at least 4 bytes to copy */ |
| BLT _b1tail /* n < 4 (signed compare): byte loop only */ |
| |
| _b4align: /* align destination on 4 */ |
| AND.S $3, R(TE), R(TMP) /* TMP = TE & 3; Z is set once TE is word aligned */ |
| BEQ _b4aligned |
| |
| MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed: FROM -= 1, then load the byte at FROM */ |
| MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed: TE -= 1, then store the byte at TE */ |
| B _b4align |
| |
| _b4aligned: /* is source now aligned? */ |
| AND.S $3, R(FROM), R(TMP) /* TMP = FROM & 3; _bunaligned reads this value */ |
| BNE _bunaligned |
| |
| ADD $31, R(TS), R(TMP) /* do 32-byte chunks if possible: loop while TE > TS + 31 */ |
| MOVW R(TS), savedts-4(SP) /* spill TS: the block move below overwrites R0 */ |
| _b32loop: |
| CMP R(TMP), R(TE) |
| BLS _b4tail /* fewer than 32 bytes left */ |
| |
| MOVM.DB.W (R(FROM)), [R0-R7] /* load the 8 words below FROM; FROM -= 32 */ |
| MOVM.DB.W [R0-R7], (R(TE)) /* store them below TE; TE -= 32 */ |
| B _b32loop |
| |
| _b4tail: /* do remaining words if possible */ |
| MOVW savedts-4(SP), R(TS) /* reload the spilled TS */ |
| ADD $3, R(TS), R(TMP) /* loop while TE > TS + 3 */ |
| _b4loop: |
| CMP R(TMP), R(TE) |
| BLS _b1tail /* fewer than 4 bytes left */ |
| |
| MOVW.W -4(R(FROM)), R(TMP1) /* pre-indexed: FROM -= 4, then load the word at FROM */ |
| MOVW.W R(TMP1), -4(R(TE)) /* pre-indexed: TE -= 4, then store the word at TE */ |
| B _b4loop |
| |
| _b1tail: /* remaining bytes */ |
| CMP R(TE), R(TS) /* done once TE has walked down to TS */ |
| BEQ _return |
| |
| MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed: FROM -= 1, then load the byte at FROM */ |
| MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed: TE -= 1, then store the byte at TE */ |
| B _b1tail |
| |
| _forward: |
| CMP $4, R(N) /* need at least 4 bytes to copy */ |
| BLT _f1tail /* n < 4 (signed compare): byte loop only */ |
| |
| _f4align: /* align destination on 4 */ |
| AND.S $3, R(TS), R(TMP) /* TMP = TS & 3; Z is set once TS is word aligned */ |
| BEQ _f4aligned |
| |
| MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back: load the byte at FROM, then FROM += 1 */ |
| MOVBU.P R(TMP), 1(R(TS)) /* implicit write back: store the byte at TS, then TS += 1 */ |
| B _f4align |
| |
| _f4aligned: /* is source now aligned? */ |
| AND.S $3, R(FROM), R(TMP) /* TMP = FROM & 3; _funaligned reads this value */ |
| BNE _funaligned |
| |
| SUB $31, R(TE), R(TMP) /* do 32-byte chunks if possible: loop while TS < TE - 31 */ |
| MOVW R(TE), savedte-4(SP) /* spill TE: the block move below overwrites R8 */ |
| _f32loop: |
| CMP R(TMP), R(TS) |
| BHS _f4tail /* fewer than 32 bytes left */ |
| |
| MOVM.IA.W (R(FROM)), [R1-R8] /* load 8 words starting at FROM; FROM += 32 */ |
| MOVM.IA.W [R1-R8], (R(TS)) /* store them starting at TS; TS += 32 */ |
| B _f32loop |
| |
| _f4tail: |
| MOVW savedte-4(SP), R(TE) /* reload the spilled TE */ |
| SUB $3, R(TE), R(TMP) /* do remaining words if possible: loop while TS < TE - 3 */ |
| _f4loop: |
| CMP R(TMP), R(TS) |
| BHS _f1tail /* fewer than 4 bytes left */ |
| |
| MOVW.P 4(R(FROM)), R(TMP1) /* implicit write back: load the word at FROM, then FROM += 4 */ |
| MOVW.P R(TMP1), 4(R(TS)) /* implicit write back: store the word at TS, then TS += 4 */ |
| B _f4loop |
| |
| _f1tail: |
| CMP R(TS), R(TE) /* done once TS has walked up to TE */ |
| BEQ _return |
| |
| MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back: load the byte at FROM, then FROM += 1 */ |
| MOVBU.P R(TMP), 1(R(TS)) /* implicit write back: store the byte at TS, then TS += 1 */ |
| B _f1tail |
| |
| _return: |
| MOVW to+0(FP), R0 /* R0 = the original to argument */ |
| RET |
| |
| _bunaligned: |
| CMP $2, R(TMP) /* TMP = FROM & 3, which is 1, 2 or 3 here; is R(TMP) < 2 ? */ |
| |
| MOVW.LT $8, R(RSHIFT) /* FROM & 3 == 1: (R(n)<<24)|(R(n-1)>>8) */ |
| MOVW.LT $24, R(LSHIFT) |
| MOVW.LT $1, R(OFFSET) |
| |
| MOVW.EQ $16, R(RSHIFT) /* FROM & 3 == 2: (R(n)<<16)|(R(n-1)>>16) */ |
| MOVW.EQ $16, R(LSHIFT) |
| MOVW.EQ $2, R(OFFSET) |
| |
| MOVW.GT $24, R(RSHIFT) /* FROM & 3 == 3: (R(n)<<8)|(R(n-1)>>24) */ |
| MOVW.GT $8, R(LSHIFT) |
| MOVW.GT $3, R(OFFSET) |
| |
| ADD $16, R(TS), R(TMP) /* do 16-byte chunks if possible: loop while TE > TS + 16 */ |
| CMP R(TMP), R(TE) |
| BLS _b1tail /* 16 bytes or fewer left: byte loop; FROM and TS are still untouched */ |
| |
| BIC $3, R(FROM) /* align source: round FROM down to a word boundary */ |
| MOVW R(TS), savedts-4(SP) /* spill TS: BR0 uses the same register */ |
| MOVW (R(FROM)), R(BR0) /* prime first block register */ |
| |
| _bu16loop: |
| CMP R(TMP), R(TE) |
| BLS _bu1tail |
| |
| MOVW R(BR0)<<R(LSHIFT), R(BW3) /* start BW3 from the previous BR0 before the load replaces it */ |
| MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)] /* load the 4 words below FROM; FROM -= 16 */ |
| ORR R(BR3)>>R(RSHIFT), R(BW3) /* finish BW3 with the new BR3 */ |
| |
| MOVW R(BR3)<<R(LSHIFT), R(BW2) /* BW2 reuses BR3's register, which is no longer needed */ |
| ORR R(BR2)>>R(RSHIFT), R(BW2) |
| |
| MOVW R(BR2)<<R(LSHIFT), R(BW1) /* BW1 reuses BR2's register */ |
| ORR R(BR1)>>R(RSHIFT), R(BW1) |
| |
| MOVW R(BR1)<<R(LSHIFT), R(BW0) /* BW0 reuses BR1's register; BR0 is kept for the next round */ |
| ORR R(BR0)>>R(RSHIFT), R(BW0) |
| |
| MOVM.DB.W [R(BW0)-R(BW3)], (R(TE)) /* store the 4 merged words below TE; TE -= 16 */ |
| B _bu16loop |
| |
| _bu1tail: |
| MOVW savedts-4(SP), R(TS) /* reload the spilled TS */ |
| ADD R(OFFSET), R(FROM) /* add back the FROM & 3 bytes that BIC removed */ |
| B _b1tail |
| |
| _funaligned: |
| CMP $2, R(TMP) /* TMP = FROM & 3, which is 1, 2 or 3 here */ |
| |
| MOVW.LT $8, R(RSHIFT) /* FROM & 3 == 1: (R(n+1)<<24)|(R(n)>>8) */ |
| MOVW.LT $24, R(LSHIFT) |
| MOVW.LT $3, R(OFFSET) |
| |
| MOVW.EQ $16, R(RSHIFT) /* FROM & 3 == 2: (R(n+1)<<16)|(R(n)>>16) */ |
| MOVW.EQ $16, R(LSHIFT) |
| MOVW.EQ $2, R(OFFSET) |
| |
| MOVW.GT $24, R(RSHIFT) /* FROM & 3 == 3: (R(n+1)<<8)|(R(n)>>24) */ |
| MOVW.GT $8, R(LSHIFT) |
| MOVW.GT $1, R(OFFSET) |
| |
| SUB $16, R(TE), R(TMP) /* do 16-byte chunks if possible: loop while TS < TE - 16 */ |
| CMP R(TMP), R(TS) |
| BHS _f1tail /* 16 bytes or fewer left: byte loop; FROM and TE are still untouched */ |
| |
| BIC $3, R(FROM) /* align source: round FROM down to a word boundary */ |
| MOVW R(TE), savedte-4(SP) /* spill TE: FR3 uses the same register */ |
| MOVW.P 4(R(FROM)), R(FR3) /* prime last block register, implicit write back (FROM += 4) */ |
| |
| _fu16loop: |
| CMP R(TMP), R(TS) |
| BHS _fu1tail |
| |
| MOVW R(FR3)>>R(RSHIFT), R(FW0) /* start FW0 from the previous FR3 before the load replaces it */ |
| MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)] /* load 4 words starting at FROM; FROM += 16 */ |
| ORR R(FR0)<<R(LSHIFT), R(FW0) /* finish FW0 with the new FR0 */ |
| |
| MOVW R(FR0)>>R(RSHIFT), R(FW1) /* FW1 reuses FR0's register, which is no longer needed */ |
| ORR R(FR1)<<R(LSHIFT), R(FW1) |
| |
| MOVW R(FR1)>>R(RSHIFT), R(FW2) /* FW2 reuses FR1's register */ |
| ORR R(FR2)<<R(LSHIFT), R(FW2) |
| |
| MOVW R(FR2)>>R(RSHIFT), R(FW3) /* FW3 reuses FR2's register; FR3 is kept for the next round */ |
| ORR R(FR3)<<R(LSHIFT), R(FW3) |
| |
| MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS)) /* store the 4 merged words at TS; TS += 16 */ |
| B _fu16loop |
| |
| _fu1tail: |
| MOVW savedte-4(SP), R(TE) /* reload the spilled TE */ |
| SUB R(OFFSET), R(FROM) /* step FROM back to the first source byte not yet copied */ |
| B _f1tail |