crypto: arm/ghash - Move NEON GHASH assembly into its own file
arch/arm/crypto/ghash-ce-core.S implements pmull_ghash_update_p8(), which is used only by a crypto_shash implementation of GHASH. It also implements other functions, including pmull_ghash_update_p64() and others, which are used only by a crypto_aead implementation of AES-GCM. While some code is shared between pmull_ghash_update_p8() and pmull_ghash_update_p64(), it's not very much. Since pmull_ghash_update_p8() will also need to be migrated into lib/crypto/ to achieve parity in the standalone GHASH support, let's move it into a separate file ghash-neon-core.S. Acked-by: Ard Biesheuvel <ardb@kernel.org> Link: https://lore.kernel.org/r/20260319061723.1140720-7-ebiggers@kernel.org Signed-off-by: Eric Biggers <ebiggers@kernel.org>
master
parent
39afaff983
commit
ca5ff14c1a
|
|
@ -10,4 +10,4 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
|
|||
|
||||
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
|
||||
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
|
||||
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
|
||||
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o ghash-neon-core.o
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
|
||||
* Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
|
||||
*
|
||||
* Copyright (C) 2015 - 2017 Linaro Ltd.
|
||||
* Copyright (C) 2023 Google LLC. <ardb@google.com>
|
||||
|
|
@ -29,39 +29,10 @@
|
|||
XM_H .req d7
|
||||
XH_L .req d8
|
||||
|
||||
t0l .req d10
|
||||
t0h .req d11
|
||||
t1l .req d12
|
||||
t1h .req d13
|
||||
t2l .req d14
|
||||
t2h .req d15
|
||||
t3l .req d16
|
||||
t3h .req d17
|
||||
t4l .req d18
|
||||
t4h .req d19
|
||||
|
||||
t0q .req q5
|
||||
t1q .req q6
|
||||
t2q .req q7
|
||||
t3q .req q8
|
||||
t4q .req q9
|
||||
XH2 .req q9
|
||||
|
||||
s1l .req d20
|
||||
s1h .req d21
|
||||
s2l .req d22
|
||||
s2h .req d23
|
||||
s3l .req d24
|
||||
s3h .req d25
|
||||
s4l .req d26
|
||||
s4h .req d27
|
||||
|
||||
MASK .req d28
|
||||
SHASH2_p8 .req d28
|
||||
|
||||
k16 .req d29
|
||||
k32 .req d30
|
||||
k48 .req d31
|
||||
SHASH2_p64 .req d31
|
||||
|
||||
HH .req q10
|
||||
|
|
@ -93,72 +64,6 @@
|
|||
|
||||
.text
|
||||
|
||||
.macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4
|
||||
vmull.p64 \rd, \rn, \rm
|
||||
.endm
|
||||
|
||||
/*
|
||||
* This implementation of 64x64 -> 128 bit polynomial multiplication
|
||||
* using vmull.p8 instructions (8x8 -> 16) is taken from the paper
|
||||
* "Fast Software Polynomial Multiplication on ARM Processors Using
|
||||
* the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
|
||||
* Ricardo Dahab (https://hal.inria.fr/hal-01506572)
|
||||
*
|
||||
* It has been slightly tweaked for in-order performance, and to allow
|
||||
* 'rq' to overlap with 'ad' or 'bd'.
|
||||
*/
|
||||
.macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
|
||||
vext.8 t0l, \ad, \ad, #1 @ A1
|
||||
.ifc \b1, t4l
|
||||
vext.8 t4l, \bd, \bd, #1 @ B1
|
||||
.endif
|
||||
vmull.p8 t0q, t0l, \bd @ F = A1*B
|
||||
vext.8 t1l, \ad, \ad, #2 @ A2
|
||||
vmull.p8 t4q, \ad, \b1 @ E = A*B1
|
||||
.ifc \b2, t3l
|
||||
vext.8 t3l, \bd, \bd, #2 @ B2
|
||||
.endif
|
||||
vmull.p8 t1q, t1l, \bd @ H = A2*B
|
||||
vext.8 t2l, \ad, \ad, #3 @ A3
|
||||
vmull.p8 t3q, \ad, \b2 @ G = A*B2
|
||||
veor t0q, t0q, t4q @ L = E + F
|
||||
.ifc \b3, t4l
|
||||
vext.8 t4l, \bd, \bd, #3 @ B3
|
||||
.endif
|
||||
vmull.p8 t2q, t2l, \bd @ J = A3*B
|
||||
veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
|
||||
veor t1q, t1q, t3q @ M = G + H
|
||||
.ifc \b4, t3l
|
||||
vext.8 t3l, \bd, \bd, #4 @ B4
|
||||
.endif
|
||||
vmull.p8 t4q, \ad, \b3 @ I = A*B3
|
||||
veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
|
||||
vmull.p8 t3q, \ad, \b4 @ K = A*B4
|
||||
vand t0h, t0h, k48
|
||||
vand t1h, t1h, k32
|
||||
veor t2q, t2q, t4q @ N = I + J
|
||||
veor t0l, t0l, t0h
|
||||
veor t1l, t1l, t1h
|
||||
veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
|
||||
vand t2h, t2h, k16
|
||||
veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
|
||||
vmov.i64 t3h, #0
|
||||
vext.8 t0q, t0q, t0q, #15
|
||||
veor t2l, t2l, t2h
|
||||
vext.8 t1q, t1q, t1q, #14
|
||||
vmull.p8 \rq, \ad, \bd @ D = A*B
|
||||
vext.8 t2q, t2q, t2q, #13
|
||||
vext.8 t3q, t3q, t3q, #12
|
||||
veor t0q, t0q, t1q
|
||||
veor t2q, t2q, t3q
|
||||
veor \rq, \rq, t0q
|
||||
veor \rq, \rq, t2q
|
||||
.endm
|
||||
|
||||
//
|
||||
// PMULL (64x64->128) based reduction for CPUs that can do
|
||||
// it in a single instruction.
|
||||
//
|
||||
.macro __pmull_reduce_p64
|
||||
vmull.p64 T1, XL_L, MASK
|
||||
|
||||
|
|
@ -170,30 +75,7 @@
|
|||
vmull.p64 XL, T1_H, MASK
|
||||
.endm
|
||||
|
||||
//
|
||||
// Alternative reduction for CPUs that lack support for the
|
||||
// 64x64->128 PMULL instruction
|
||||
//
|
||||
.macro __pmull_reduce_p8
|
||||
veor XL_H, XL_H, XM_L
|
||||
veor XH_L, XH_L, XM_H
|
||||
|
||||
vshl.i64 T1, XL, #57
|
||||
vshl.i64 T2, XL, #62
|
||||
veor T1, T1, T2
|
||||
vshl.i64 T2, XL, #63
|
||||
veor T1, T1, T2
|
||||
veor XL_H, XL_H, T1_L
|
||||
veor XH_L, XH_L, T1_H
|
||||
|
||||
vshr.u64 T1, XL, #1
|
||||
veor XH, XH, XL
|
||||
veor XL, XL, T1
|
||||
vshr.u64 T1, T1, #6
|
||||
vshr.u64 XL, XL, #1
|
||||
.endm
|
||||
|
||||
.macro ghash_update, pn, enc, aggregate=1, head=1
|
||||
.macro ghash_update, enc, aggregate=1, head=1
|
||||
vld1.64 {XL}, [r1]
|
||||
|
||||
.if \head
|
||||
|
|
@ -206,8 +88,7 @@
|
|||
b 3f
|
||||
.endif
|
||||
|
||||
0: .ifc \pn, p64
|
||||
.if \aggregate
|
||||
0: .if \aggregate
|
||||
tst r0, #3 // skip until #blocks is a
|
||||
bne 2f // round multiple of 4
|
||||
|
||||
|
|
@ -288,7 +169,6 @@
|
|||
|
||||
b 1b
|
||||
.endif
|
||||
.endif
|
||||
|
||||
2: vld1.8 {T1}, [r2]!
|
||||
|
||||
|
|
@ -308,15 +188,15 @@
|
|||
veor T1_L, T1_L, XL_H
|
||||
veor XL, XL, IN1
|
||||
|
||||
__pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
|
||||
vmull.p64 XH, XL_H, SHASH_H @ a1 * b1
|
||||
veor T1, T1, XL
|
||||
__pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
|
||||
__pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
|
||||
vmull.p64 XL, XL_L, SHASH_L @ a0 * b0
|
||||
vmull.p64 XM, T1_L, SHASH2_p64 @ (a1+a0)(b1+b0)
|
||||
|
||||
4: veor T1, XL, XH
|
||||
veor XM, XM, T1
|
||||
|
||||
__pmull_reduce_\pn
|
||||
__pmull_reduce_p64
|
||||
|
||||
veor T1, T1, XH
|
||||
veor XL, XL, T1
|
||||
|
|
@ -325,8 +205,8 @@
|
|||
.endm
|
||||
|
||||
/*
|
||||
* void pmull_ghash_update(int blocks, u64 dg[], const char *src,
|
||||
* struct ghash_key const *k, const char *head)
|
||||
* void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
|
||||
* u64 const h[4][2], const char *head)
|
||||
*/
|
||||
ENTRY(pmull_ghash_update_p64)
|
||||
vld1.64 {SHASH}, [r3]!
|
||||
|
|
@ -341,35 +221,12 @@ ENTRY(pmull_ghash_update_p64)
|
|||
vmov.i8 MASK, #0xe1
|
||||
vshl.u64 MASK, MASK, #57
|
||||
|
||||
ghash_update p64
|
||||
ghash_update
|
||||
vst1.64 {XL}, [r1]
|
||||
|
||||
bx lr
|
||||
ENDPROC(pmull_ghash_update_p64)
|
||||
|
||||
ENTRY(pmull_ghash_update_p8)
|
||||
vld1.64 {SHASH}, [r3]
|
||||
veor SHASH2_p8, SHASH_L, SHASH_H
|
||||
|
||||
vext.8 s1l, SHASH_L, SHASH_L, #1
|
||||
vext.8 s2l, SHASH_L, SHASH_L, #2
|
||||
vext.8 s3l, SHASH_L, SHASH_L, #3
|
||||
vext.8 s4l, SHASH_L, SHASH_L, #4
|
||||
vext.8 s1h, SHASH_H, SHASH_H, #1
|
||||
vext.8 s2h, SHASH_H, SHASH_H, #2
|
||||
vext.8 s3h, SHASH_H, SHASH_H, #3
|
||||
vext.8 s4h, SHASH_H, SHASH_H, #4
|
||||
|
||||
vmov.i64 k16, #0xffff
|
||||
vmov.i64 k32, #0xffffffff
|
||||
vmov.i64 k48, #0xffffffffffff
|
||||
|
||||
ghash_update p8
|
||||
vst1.64 {XL}, [r1]
|
||||
|
||||
bx lr
|
||||
ENDPROC(pmull_ghash_update_p8)
|
||||
|
||||
e0 .req q9
|
||||
e1 .req q10
|
||||
e2 .req q11
|
||||
|
|
@ -536,7 +393,7 @@ ENTRY(pmull_gcm_encrypt)
|
|||
|
||||
vld1.64 {SHASH}, [r3]
|
||||
|
||||
ghash_update p64, enc, head=0
|
||||
ghash_update enc, head=0
|
||||
vst1.64 {XL}, [r1]
|
||||
|
||||
pop {r4-r8, pc}
|
||||
|
|
@ -554,7 +411,7 @@ ENTRY(pmull_gcm_decrypt)
|
|||
|
||||
vld1.64 {SHASH}, [r3]
|
||||
|
||||
ghash_update p64, dec, head=0
|
||||
ghash_update dec, head=0
|
||||
vst1.64 {XL}, [r1]
|
||||
|
||||
pop {r4-r8, pc}
|
||||
|
|
@ -603,7 +460,7 @@ ENTRY(pmull_gcm_enc_final)
|
|||
vshl.u64 MASK, MASK, #57
|
||||
mov r0, #1
|
||||
bne 3f // process head block first
|
||||
ghash_update p64, aggregate=0, head=0
|
||||
ghash_update aggregate=0, head=0
|
||||
|
||||
vrev64.8 XL, XL
|
||||
vext.8 XL, XL, XL, #8
|
||||
|
|
@ -660,7 +517,7 @@ ENTRY(pmull_gcm_dec_final)
|
|||
vshl.u64 MASK, MASK, #57
|
||||
mov r0, #1
|
||||
bne 3f // process head block first
|
||||
ghash_update p64, aggregate=0, head=0
|
||||
ghash_update aggregate=0, head=0
|
||||
|
||||
vrev64.8 XL, XL
|
||||
vext.8 XL, XL, XL, #8
|
||||
|
|
|
|||
|
|
@ -0,0 +1,207 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Accelerated GHASH implementation with NEON vmull.p8 instructions.
|
||||
*
|
||||
* Copyright (C) 2015 - 2017 Linaro Ltd.
|
||||
* Copyright (C) 2023 Google LLC. <ardb@google.com>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
.fpu neon
|
||||
|
||||
/* 128-bit working registers. */
/* XH and IN1 deliberately share q4: IN1 is consumed before XH is first written. */
SHASH .req q0
|
||||
T1 .req q1
|
||||
XL .req q2
|
||||
XM .req q3
|
||||
XH .req q4
|
||||
IN1 .req q4
|
||||
|
||||
/* 64-bit halves of the q registers above (d(2n) = low half, d(2n+1) = high half of qn). */
SHASH_L .req d0
|
||||
SHASH_H .req d1
|
||||
T1_L .req d2
|
||||
T1_H .req d3
|
||||
XL_L .req d4
|
||||
XL_H .req d5
|
||||
XM_L .req d6
|
||||
XM_H .req d7
|
||||
XH_L .req d8
|
||||
|
||||
/* Scratch halves used inside __pmull_p8; they are the d-register views of t0q..t4q below. */
t0l .req d10
|
||||
t0h .req d11
|
||||
t1l .req d12
|
||||
t1h .req d13
|
||||
t2l .req d14
|
||||
t2h .req d15
|
||||
t3l .req d16
|
||||
t3h .req d17
|
||||
t4l .req d18
|
||||
t4h .req d19
|
||||
|
||||
/* Scratch q registers clobbered by every __pmull_p8 expansion. */
t0q .req q5
|
||||
t1q .req q6
|
||||
t2q .req q7
|
||||
t3q .req q8
|
||||
t4q .req q9
|
||||
|
||||
/* Byte-rotated copies of the key halves: sNl/sNh = SHASH_L/SHASH_H rotated by N bytes. */
s1l .req d20
|
||||
s1h .req d21
|
||||
s2l .req d22
|
||||
s2h .req d23
|
||||
s3l .req d24
|
||||
s3h .req d25
|
||||
s4l .req d26
|
||||
s4h .req d27
|
||||
|
||||
/* SHASH_L ^ SHASH_H, the (b1+b0) operand of the middle product. */
SHASH2_p8 .req d28
|
||||
|
||||
/* 64-bit masks with the low 16, 32 and 48 bits set; read by __pmull_p8. */
k16 .req d29
|
||||
k32 .req d30
|
||||
k48 .req d31
|
||||
|
||||
/* T2 aliases t2q (q7); it is only used by __pmull_reduce_p8, after the multiplies are done. */
T2 .req q7
|
||||
|
||||
.text
|
||||
|
||||
/*
|
||||
* This implementation of 64x64 -> 128 bit polynomial multiplication
|
||||
* using vmull.p8 instructions (8x8 -> 16) is taken from the paper
|
||||
* "Fast Software Polynomial Multiplication on ARM Processors Using
|
||||
* the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
|
||||
* Ricardo Dahab (https://hal.inria.fr/hal-01506572)
|
||||
*
|
||||
* It has been slightly tweaked for in-order performance, and to allow
|
||||
* 'rq' to overlap with 'ad' or 'bd'.
|
||||
*/
|
||||
.macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
|
||||
/*
 * rq       - q register that receives the 128-bit product of ad and bd
 * ad, bd   - d registers holding the two 64-bit polynomial operands
 * b1..b4   - optional d registers holding bd rotated by 1, 2, 3 and 4
 *            bytes; when left at the defaults (t4l/t3l) the rotations
 *            are computed here with vext.8
 *
 * Clobbers t0q-t4q (q5-q9). Reads k16, k32 and k48, which the caller
 * must have initialised.
 */
vext.8 t0l, \ad, \ad, #1 @ A1
|
||||
.ifc \b1, t4l
|
||||
vext.8 t4l, \bd, \bd, #1 @ B1
|
||||
.endif
|
||||
vmull.p8 t0q, t0l, \bd @ F = A1*B
|
||||
vext.8 t1l, \ad, \ad, #2 @ A2
|
||||
vmull.p8 t4q, \ad, \b1 @ E = A*B1
|
||||
.ifc \b2, t3l
|
||||
vext.8 t3l, \bd, \bd, #2 @ B2
|
||||
.endif
|
||||
vmull.p8 t1q, t1l, \bd @ H = A2*B
|
||||
vext.8 t2l, \ad, \ad, #3 @ A3
|
||||
vmull.p8 t3q, \ad, \b2 @ G = A*B2
|
||||
veor t0q, t0q, t4q @ L = E + F
|
||||
/* b3 shares its default register (t4l) with b1; t4q (E) was consumed by the veor above. */
.ifc \b3, t4l
|
||||
vext.8 t4l, \bd, \bd, #3 @ B3
|
||||
.endif
|
||||
vmull.p8 t2q, t2l, \bd @ J = A3*B
|
||||
veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
|
||||
veor t1q, t1q, t3q @ M = G + H
|
||||
/* b4 shares its default register (t3l) with b2; t3q (G) was consumed by the veor above. */
.ifc \b4, t3l
|
||||
vext.8 t3l, \bd, \bd, #4 @ B4
|
||||
.endif
|
||||
vmull.p8 t4q, \ad, \b3 @ I = A*B3
|
||||
veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
|
||||
vmull.p8 t3q, \ad, \b4 @ K = A*B4
|
||||
/* Mask the high halves; presumably this drops the bytes that would wrap around in the vext.8 rotations below (TODO confirm against the paper). */
vand t0h, t0h, k48
|
||||
vand t1h, t1h, k32
|
||||
veor t2q, t2q, t4q @ N = I + J
|
||||
veor t0l, t0l, t0h
|
||||
veor t1l, t1l, t1h
|
||||
veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
|
||||
vand t2h, t2h, k16
|
||||
veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
|
||||
vmov.i64 t3h, #0
|
||||
/* vext.8 by #15/#14/#13/#12 rotates each partial sum up by 1/2/3/4 bytes, matching the << 8/16/24/32 noted above. */
vext.8 t0q, t0q, t0q, #15
|
||||
veor t2l, t2l, t2h
|
||||
vext.8 t1q, t1q, t1q, #14
|
||||
/* \rq is first written here, after the last read of \ad and \bd, which is what allows rq to overlap either operand. */
vmull.p8 \rq, \ad, \bd @ D = A*B
|
||||
vext.8 t2q, t2q, t2q, #13
|
||||
vext.8 t3q, t3q, t3q, #12
|
||||
veor t0q, t0q, t1q
|
||||
veor t2q, t2q, t3q
|
||||
/* Fold the shifted partial sums into D to form the final product. */
veor \rq, \rq, t0q
|
||||
veor \rq, \rq, t2q
|
||||
.endm
|
||||
|
||||
.macro __pmull_reduce_p8
|
||||
/*
 * Reduction step built only from 64-bit lane shifts and XORs (no
 * 64x64 polynomial multiply is used).
 *
 * In:  XL, XM, XH - low, middle and high partial products
 * Out: intermediate values in XL, XH and T1; the expansion site still
 *      has to XOR XH into T1 and T1 into XL to obtain the result.
 * Clobbers T2, which aliases t2q (q7).
 */
/* Fold the middle term into the adjacent halves of the low and high terms. */
veor XL_H, XL_H, XM_L
|
||||
veor XH_L, XH_L, XM_H
|
||||
|
||||
/* T1 = (XL << 57) ^ (XL << 62) ^ (XL << 63), per 64-bit lane. */
/* NOTE(review): the shift counts presumably encode the x^7, x^2 and x terms of the GHASH field polynomial - confirm. */
vshl.i64 T1, XL, #57
|
||||
vshl.i64 T2, XL, #62
|
||||
veor T1, T1, T2
|
||||
vshl.i64 T2, XL, #63
|
||||
veor T1, T1, T2
|
||||
veor XL_H, XL_H, T1_L
|
||||
veor XH_L, XH_L, T1_H
|
||||
|
||||
/* Right-shift phase: T1 is reused as scratch, XH accumulates the pre-shift XL. */
vshr.u64 T1, XL, #1
|
||||
veor XH, XH, XL
|
||||
veor XL, XL, T1
|
||||
vshr.u64 T1, T1, #6
|
||||
vshr.u64 XL, XL, #1
|
||||
.endm
|
||||
|
||||
.macro ghash_update
|
||||
/*
 * Core GHASH loop, expanded inside pmull_ghash_update_p8.
 *
 * Register usage at expansion time:
 *   r0   - number of 16-byte blocks to read from r2
 *   r1   - address of the 128-bit digest (loaded into XL here)
 *   r2   - source pointer, post-incremented by each block load
 *   [sp] - optional head block pointer; NULL means "no head block"
 * SHASH, SHASH2_p8, s1l..s4h and k16/k32/k48 must already be set up.
 * On exit XL holds the updated digest; it is NOT stored back here.
 * Clobbers ip, r0, r2, the condition flags, T1, XM, XH/IN1 and t0q-t4q.
 *
 * NOTE(review): if the head pointer is NULL and r0 is 0, label 0 still
 * loads one block and the subs wraps r0; callers presumably guarantee
 * blocks >= 1 in that case - confirm.
 */
vld1.64 {XL}, [r1]
|
||||
|
||||
/* do the head block first, if supplied */
|
||||
ldr ip, [sp]
|
||||
teq ip, #0
|
||||
beq 0f
|
||||
vld1.64 {T1}, [ip]
|
||||
/* Set Z from the block count so the bne at the bottom only loops if source blocks remain after the head block. */
teq r0, #0
|
||||
b 3f
|
||||
|
||||
0:
|
||||
vld1.8 {T1}, [r2]!
|
||||
/* The flags set here are tested by the bne at the bottom; only NEON instructions sit in between. */
subs r0, r0, #1
|
||||
|
||||
3: /* multiply XL by SHASH in GF(2^128) */
|
||||
vrev64.8 T1, T1
|
||||
|
||||
/* vrev64.8 reversed the bytes within each half; swapping the halves into IN1 completes a full 16-byte reversal. */
vext.8 IN1, T1, T1, #8
|
||||
veor T1_L, T1_L, XL_H
|
||||
/* XL ^= byte-reversed input block. IN1 shares q4 with XH, so it must be consumed before the first multiply writes XH. */
veor XL, XL, IN1
|
||||
|
||||
__pmull_p8 XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
|
||||
/* Leaves a1 ^ a0 in T1_L; must happen before the next multiply overwrites XL. */
veor T1, T1, XL
|
||||
__pmull_p8 XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
|
||||
/* No precomputed rotations are passed here, so the macro derives them from SHASH2_p8 itself. */
__pmull_p8 XM, T1_L, SHASH2_p8 @ (a1+a0)(b1+b0)
|
||||
|
||||
/* XM ^= XL ^ XH: turn (a1+a0)(b1+b0) into the middle term of the product. */
veor T1, XL, XH
|
||||
veor XM, XM, T1
|
||||
|
||||
__pmull_reduce_p8
|
||||
|
||||
/* Finish the reduction: combine the intermediate values left in T1 and XH into XL. */
veor T1, T1, XH
|
||||
veor XL, XL, T1
|
||||
|
||||
/* Loop while the teq/subs above left Z clear. */
bne 0b
|
||||
.endm
|
||||
|
||||
/*
|
||||
* void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
|
||||
* u64 const h[1][2], const char *head)
|
||||
*/
|
||||
ENTRY(pmull_ghash_update_p8)
|
||||
/*
 * In:  r0 = blocks, r1 = dg, r2 = src, r3 = h, [sp] = head
 * Out: the updated digest is written back to [r1]; no return value.
 *
 * NOTE(review): q4-q7 (XH and t0q-t2q) are written without being
 * saved; presumably every caller brackets this routine with the
 * kernel-mode NEON save/restore - confirm.
 */
/* Load the 128-bit hash key and form b1 ^ b0 for the middle product. */
vld1.64 {SHASH}, [r3]
|
||||
veor SHASH2_p8, SHASH_L, SHASH_H
|
||||
|
||||
/* Precompute the key halves rotated by 1..4 bytes once, so __pmull_p8 need not redo it for every block. */
vext.8 s1l, SHASH_L, SHASH_L, #1
|
||||
vext.8 s2l, SHASH_L, SHASH_L, #2
|
||||
vext.8 s3l, SHASH_L, SHASH_L, #3
|
||||
vext.8 s4l, SHASH_L, SHASH_L, #4
|
||||
vext.8 s1h, SHASH_H, SHASH_H, #1
|
||||
vext.8 s2h, SHASH_H, SHASH_H, #2
|
||||
vext.8 s3h, SHASH_H, SHASH_H, #3
|
||||
vext.8 s4h, SHASH_H, SHASH_H, #4
|
||||
|
||||
/* Masks read by __pmull_p8: low 16, 32 and 48 bits set. */
vmov.i64 k16, #0xffff
|
||||
vmov.i64 k32, #0xffffffff
|
||||
vmov.i64 k48, #0xffffffffffff
|
||||
|
||||
/* Process the optional head block and then all source blocks; leaves the digest in XL. */
ghash_update
|
||||
vst1.64 {XL}, [r1]
|
||||
|
||||
bx lr
|
||||
ENDPROC(pmull_ghash_update_p8)
|
||||
Loading…
Reference in New Issue