| // Copyright 2023 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build ignore |
| |
| // Generate Go assembly for XORing CTR output to n blocks at once with one key. |
| package main |
| |
| import ( |
| "fmt" |
| "os" |
| "strings" |
| "text/template" |
| ) |
| |
| // First registers in their groups. |
| const ( |
| blockOffset = 0 |
| roundKeyOffset = 8 |
| dstOffset = 23 |
| ) |
| |
| var tmplArm64Str = ` |
| // Code generated by ctr_arm64_gen.go. DO NOT EDIT. |
| |
| //go:build !purego |
| |
| #include "textflag.h" |
| |
| #define NR R9 |
| #define XK R10 |
| #define DST R11 |
| #define SRC R12 |
| #define IV_LOW_LE R16 |
| #define IV_HIGH_LE R17 |
| #define IV_LOW_BE R19 |
| #define IV_HIGH_BE R20 |
| |
| // V0.B16 - V7.B16 are for blocks (<=8). See BLOCK_OFFSET. |
| // V8.B16 - V22.B16 are for <=15 round keys (<=15). See ROUND_KEY_OFFSET. |
| // V23.B16 - V30.B16 are for destinations (<=8). See DST_OFFSET. |
| |
| {{define "load_keys"}} |
| {{- range regs_batches (round_key_reg $.FirstKey) $.NKeys }} |
| VLD1.P {{ .Size }}(XK), [{{ .Regs }}] |
| {{- end }} |
| {{ end }} |
| |
| {{define "enc"}} |
| {{ range $i := xrange $.N -}} |
| AESE V{{ round_key_reg $.Key}}.B16, V{{ block_reg $i }}.B16 |
| {{- if $.WithMc }} |
| AESMC V{{ block_reg $i }}.B16, V{{ block_reg $i }}.B16 |
| {{- end }} |
| {{ end }} |
| {{ end }} |
| |
| {{ range $N := $.Sizes }} |
| // func ctrBlocks{{$N}}Asm(nr int, xk *[60]uint32, dst *[{{$N}}*16]byte, src *[{{$N}}*16]byte, ivlo uint64, ivhi uint64) |
| TEXT ·ctrBlocks{{ $N }}Asm(SB),NOSPLIT,$0 |
| MOVD nr+0(FP), NR |
| MOVD xk+8(FP), XK |
| MOVD dst+16(FP), DST |
| MOVD src+24(FP), SRC |
| MOVD ivlo+32(FP), IV_LOW_LE |
| MOVD ivhi+40(FP), IV_HIGH_LE |
| |
| {{/* Prepare plain from IV and blockIndex. */}} |
| |
| {{/* Copy to plaintext registers. */}} |
| {{ range $i := xrange $N }} |
| REV IV_LOW_LE, IV_LOW_BE |
| REV IV_HIGH_LE, IV_HIGH_BE |
| {{- /* https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general- */}} |
| VMOV IV_LOW_BE, V{{ block_reg $i }}.D[1] |
| VMOV IV_HIGH_BE, V{{ block_reg $i }}.D[0] |
| {{- if ne (add $i 1) $N }} |
| ADDS $1, IV_LOW_LE |
| ADC $0, IV_HIGH_LE |
| {{ end }} |
| {{ end }} |
| |
| {{/* Num rounds branching. */}} |
| CMP $12, NR |
| BLT Lenc128 |
| BEQ Lenc192 |
| |
| {{/* 2 extra rounds for 256-bit keys. */}} |
| Lenc256: |
| {{- template "load_keys" (load_keys_args 0 2) }} |
| {{- template "enc" (enc_args 0 $N true) }} |
| {{- template "enc" (enc_args 1 $N true) }} |
| |
| {{/* 2 extra rounds for 192-bit keys. */}} |
| Lenc192: |
| {{- template "load_keys" (load_keys_args 2 2) }} |
| {{- template "enc" (enc_args 2 $N true) }} |
| {{- template "enc" (enc_args 3 $N true) }} |
| |
| {{/* 10 rounds for 128-bit (with special handling for final). */}} |
| Lenc128: |
| {{- template "load_keys" (load_keys_args 4 11) }} |
| {{- range $r := xrange 9 }} |
| {{- template "enc" (enc_args (add $r 4) $N true) }} |
| {{ end }} |
| {{ template "enc" (enc_args 13 $N false) }} |
| |
| {{/* We need to XOR blocks with the last round key (key 14, register V22). */}} |
| {{ range $i := xrange $N }} |
| VEOR V{{ block_reg $i }}.B16, V{{ round_key_reg 14 }}.B16, V{{ block_reg $i }}.B16 |
| {{- end }} |
| |
| {{/* XOR results to destination. */}} |
| {{- range regs_batches $.DstOffset $N }} |
| VLD1.P {{ .Size }}(SRC), [{{ .Regs }}] |
| {{- end }} |
| {{- range $i := xrange $N }} |
| VEOR V{{ add $.DstOffset $i }}.B16, V{{ block_reg $i }}.B16, V{{ add $.DstOffset $i }}.B16 |
| {{- end }} |
| {{- range regs_batches $.DstOffset $N }} |
| VST1.P [{{ .Regs }}], {{ .Size }}(DST) |
| {{- end }} |
| |
| RET |
| {{ end }} |
| ` |
| |
| func main() { |
| type Params struct { |
| DstOffset int |
| Sizes []int |
| } |
| |
| params := Params{ |
| DstOffset: dstOffset, |
| Sizes: []int{1, 2, 4, 8}, |
| } |
| |
| type RegsBatch struct { |
| Size int |
| Regs string // Comma-separated list of registers. |
| } |
| |
| type LoadKeysArgs struct { |
| FirstKey int |
| NKeys int |
| } |
| |
| type EncArgs struct { |
| Key int |
| N int |
| WithMc bool |
| } |
| |
| funcs := template.FuncMap{ |
| "add": func(a, b int) int { |
| return a + b |
| }, |
| "xrange": func(n int) []int { |
| result := make([]int, n) |
| for i := 0; i < n; i++ { |
| result[i] = i |
| } |
| return result |
| }, |
| "block_reg": func(block int) int { |
| return blockOffset + block |
| }, |
| "round_key_reg": func(key int) int { |
| return roundKeyOffset + key |
| }, |
| "regs_batches": func(firstReg, nregs int) []RegsBatch { |
| result := make([]RegsBatch, 0) |
| for nregs != 0 { |
| batch := 4 |
| if nregs < batch { |
| batch = nregs |
| } |
| regsList := make([]string, 0, batch) |
| for j := firstReg; j < firstReg+batch; j++ { |
| regsList = append(regsList, fmt.Sprintf("V%d.B16", j)) |
| } |
| result = append(result, RegsBatch{ |
| Size: 16 * batch, |
| Regs: strings.Join(regsList, ", "), |
| }) |
| nregs -= batch |
| firstReg += batch |
| } |
| return result |
| }, |
| "enc_args": func(key, n int, withMc bool) EncArgs { |
| return EncArgs{ |
| Key: key, |
| N: n, |
| WithMc: withMc, |
| } |
| }, |
| "load_keys_args": func(firstKey, nkeys int) LoadKeysArgs { |
| return LoadKeysArgs{ |
| FirstKey: firstKey, |
| NKeys: nkeys, |
| } |
| }, |
| } |
| |
| var tmpl = template.Must(template.New("ctr_arm64").Funcs(funcs).Parse(tmplArm64Str)) |
| |
| if err := tmpl.Execute(os.Stdout, params); err != nil { |
| panic(err) |
| } |
| } |