lib/crypto: riscv/poly1305: Import OpenSSL/CRYPTOGAMS implementation
This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305
implementation for riscv authored by Andy Polyakov. The file
'poly1305-riscv.pl' is taken straight from
https://github.com/dot-asm/cryptogams commit
5e3fba73576244708a752fa61a8e93e587f271bb.

This patch was tested on SpacemiT X60, with 2~2.5x improvement over
the generic implementation.

Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Signed-off-by: Zhihang Shao <zhihang.shao.iscas@gmail.com>
[EB: ported to lib/crypto/riscv/]
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250829152513.92459-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>

pull/1354/merge
parent
b646b782e5
commit
bef9c75598
|
|
@ -128,6 +128,7 @@ config CRYPTO_LIB_POLY1305_ARCH
|
|||
default y if MIPS
|
||||
# The PPC64 code needs to be fixed to work in softirq context.
|
||||
default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN
|
||||
default y if RISCV
|
||||
default y if X86_64
|
||||
|
||||
# This symbol controls the inclusion of the Poly1305 generic code. This differs
|
||||
|
|
@ -143,7 +144,7 @@ config CRYPTO_LIB_POLY1305_GENERIC
|
|||
|
||||
config CRYPTO_LIB_POLY1305_RSIZE
|
||||
int
|
||||
default 2 if MIPS
|
||||
default 2 if MIPS || RISCV
|
||||
default 11 if X86_64
|
||||
default 9 if ARM || ARM64
|
||||
default 1
|
||||
|
|
|
|||
|
|
@ -112,6 +112,19 @@ endif
|
|||
|
||||
libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o
|
||||
|
||||
ifeq ($(CONFIG_RISCV),y)
|
||||
libpoly1305-y += riscv/poly1305-core.o
|
||||
poly1305-perlasm-flavour-$(CONFIG_32BIT) := 32
|
||||
poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
|
||||
quiet_cmd_perlasm_poly1305 = PERLASM $@
|
||||
cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
|
||||
# Use if_changed instead of cmd, in case the flavour changed.
|
||||
$(obj)/riscv/poly1305-core.S: $(src)/riscv/poly1305-riscv.pl FORCE
|
||||
$(call if_changed,perlasm_poly1305)
|
||||
targets += riscv/poly1305-core.S
|
||||
AFLAGS_riscv/poly1305-core.o += -Dpoly1305_init=poly1305_block_init
|
||||
endif
|
||||
|
||||
ifeq ($(CONFIG_X86),y)
|
||||
libpoly1305-y += x86/poly1305-x86_64-cryptogams.o
|
||||
$(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl
|
||||
|
|
@ -124,6 +137,7 @@ endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH
|
|||
clean-files += arm/poly1305-core.S \
|
||||
arm64/poly1305-core.S \
|
||||
mips/poly1305-core.S \
|
||||
riscv/poly1305-core.S \
|
||||
x86/poly1305-x86_64-cryptogams.S
|
||||
|
||||
################################################################################
|
||||
|
|
|
|||
|
|
@ -0,0 +1,847 @@
|
|||
#!/usr/bin/env perl
|
||||
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
|
||||
# ====================================================================
|
||||
#
|
||||
# Poly1305 hash for RISC-V.
|
||||
#
|
||||
# February 2019
|
||||
#
|
||||
# In essence, it's a pretty straightforward transliteration of the MIPS
|
||||
# module [without big-endian option].
|
||||
#
|
||||
# 1.8 cycles per byte on U74, >100% faster than compiler-generated
|
||||
# code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69%
|
||||
# improvement.
|
||||
#
|
||||
# June 2024.
|
||||
#
|
||||
# Add CHERI support.
|
||||
#
|
||||
######################################################################
|
||||
#
|
||||
# Symbolic register names. Each variable holds the architectural "xN"
# spelling of one integer register so that the assembly templates in this
# file can interpolate names such as $sp or $a0 instead of raw numbers.
my @xreg = map { "x$_" } 0 .. 31;

# x0-x4: zero, return address, stack/global/thread pointers.
($zero, $ra, $sp, $gp, $tp) = @xreg[0 .. 4];

# Temporaries: x5-x7 and x28-x31.
($t0, $t1, $t2, $t3, $t4, $t5, $t6) = @xreg[5 .. 7, 28 .. 31];

# Argument registers: x10-x17.
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = @xreg[10 .. 17];

# Saved registers: x8, x9 and x18-x27.
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
    @xreg[8, 9, 18 .. 27];
|
||||
#
|
||||
######################################################################
|
||||
|
||||
# Command line: [flavour] [output-file]
#
# The first argument selects the code path ("64" when it is missing or
# false); a flavour beginning with "cheri" additionally enables the CHERI
# rewriting done when the code is printed.
$flavour = shift || "64";

# Among the remaining arguments, the last one that looks like a file name
# with an extension is taken as the output file.
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }

# Redirect STDOUT only when an output file was actually named; otherwise
# the generated assembly goes to the inherited STDOUT. The three-argument
# form keeps the file name from being parsed as part of the open mode, and
# a failure to create the file now aborts the run instead of being ignored.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
|
||||
|
||||
$code.=<<___;
|
||||
#ifdef __KERNEL__
|
||||
# ifdef __riscv_zicfilp
|
||||
# undef __riscv_zicfilp // calls are expected to be direct
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
|
||||
# define __riscv_misaligned_fast 1
|
||||
#endif
|
||||
___
|
||||
|
||||
if ($flavour =~ /64/) {{{
|
||||
######################################################################
|
||||
# 64-bit code path...
|
||||
#
|
||||
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
|
||||
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);
|
||||
|
||||
$code.=<<___;
|
||||
#if __riscv_xlen == 64
|
||||
# if __SIZEOF_POINTER__ == 16
|
||||
# define PUSH csc
|
||||
# define POP clc
|
||||
# else
|
||||
# define PUSH sd
|
||||
# define POP ld
|
||||
# endif
|
||||
#else
|
||||
# error "unsupported __riscv_xlen"
|
||||
#endif
|
||||
|
||||
.option pic
|
||||
.text
|
||||
|
||||
.globl poly1305_init
|
||||
.type poly1305_init,\@function
|
||||
poly1305_init:
|
||||
#ifdef __riscv_zicfilp
|
||||
lpad 0
|
||||
#endif
|
||||
sd $zero,0($ctx)
|
||||
sd $zero,8($ctx)
|
||||
sd $zero,16($ctx)
|
||||
|
||||
beqz $inp,.Lno_key
|
||||
|
||||
#ifndef __riscv_misaligned_fast
|
||||
andi $tmp0,$inp,7 # $inp % 8
|
||||
andi $inp,$inp,-8 # align $inp
|
||||
slli $tmp0,$tmp0,3 # byte to bit offset
|
||||
#endif
|
||||
ld $in0,0($inp)
|
||||
ld $in1,8($inp)
|
||||
#ifndef __riscv_misaligned_fast
|
||||
beqz $tmp0,.Laligned_key
|
||||
|
||||
ld $tmp2,16($inp)
|
||||
neg $tmp1,$tmp0 # implicit &63 in sll
|
||||
srl $in0,$in0,$tmp0
|
||||
sll $tmp3,$in1,$tmp1
|
||||
srl $in1,$in1,$tmp0
|
||||
sll $tmp2,$tmp2,$tmp1
|
||||
or $in0,$in0,$tmp3
|
||||
or $in1,$in1,$tmp2
|
||||
|
||||
.Laligned_key:
|
||||
#endif
|
||||
li $tmp0,1
|
||||
slli $tmp0,$tmp0,32 # 0x0000000100000000
|
||||
addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1
|
||||
slli $tmp0,$tmp0,28 # 0x0ffffffc10000000
|
||||
addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff
|
||||
|
||||
and $in0,$in0,$tmp0
|
||||
addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc
|
||||
and $in1,$in1,$tmp0
|
||||
|
||||
sd $in0,24($ctx)
|
||||
srli $tmp0,$in1,2
|
||||
sd $in1,32($ctx)
|
||||
add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2)
|
||||
sd $tmp0,40($ctx)
|
||||
|
||||
.Lno_key:
|
||||
li $a0,0 # return 0
|
||||
ret
|
||||
.size poly1305_init,.-poly1305_init
|
||||
___
|
||||
{
|
||||
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
|
||||
($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
|
||||
my ($shr,$shl) = ($t5,$t6); # used on R6
|
||||
|
||||
$code.=<<___;
|
||||
.globl poly1305_blocks
|
||||
.type poly1305_blocks,\@function
|
||||
poly1305_blocks:
|
||||
#ifdef __riscv_zicfilp
|
||||
lpad 0
|
||||
#endif
|
||||
andi $len,$len,-16 # complete blocks only
|
||||
beqz $len,.Lno_data
|
||||
|
||||
caddi $sp,$sp,-4*__SIZEOF_POINTER__
|
||||
PUSH $s0,3*__SIZEOF_POINTER__($sp)
|
||||
PUSH $s1,2*__SIZEOF_POINTER__($sp)
|
||||
PUSH $s2,1*__SIZEOF_POINTER__($sp)
|
||||
PUSH $s3,0*__SIZEOF_POINTER__($sp)
|
||||
|
||||
#ifndef __riscv_misaligned_fast
|
||||
andi $shr,$inp,7
|
||||
andi $inp,$inp,-8 # align $inp
|
||||
slli $shr,$shr,3 # byte to bit offset
|
||||
neg $shl,$shr # implicit &63 in sll
|
||||
#endif
|
||||
|
||||
ld $h0,0($ctx) # load hash value
|
||||
ld $h1,8($ctx)
|
||||
ld $h2,16($ctx)
|
||||
|
||||
ld $r0,24($ctx) # load key
|
||||
ld $r1,32($ctx)
|
||||
ld $rs1,40($ctx)
|
||||
|
||||
add $len,$len,$inp # end of buffer
|
||||
|
||||
.Loop:
|
||||
ld $in0,0($inp) # load input
|
||||
ld $in1,8($inp)
|
||||
#ifndef __riscv_misaligned_fast
|
||||
beqz $shr,.Laligned_inp
|
||||
|
||||
ld $tmp2,16($inp)
|
||||
srl $in0,$in0,$shr
|
||||
sll $tmp3,$in1,$shl
|
||||
srl $in1,$in1,$shr
|
||||
sll $tmp2,$tmp2,$shl
|
||||
or $in0,$in0,$tmp3
|
||||
or $in1,$in1,$tmp2
|
||||
|
||||
.Laligned_inp:
|
||||
#endif
|
||||
caddi $inp,$inp,16
|
||||
|
||||
andi $tmp0,$h2,-4 # modulo-scheduled reduction
|
||||
srli $tmp1,$h2,2
|
||||
andi $h2,$h2,3
|
||||
|
||||
add $d0,$h0,$in0 # accumulate input
|
||||
add $tmp1,$tmp1,$tmp0
|
||||
sltu $tmp0,$d0,$h0
|
||||
add $d0,$d0,$tmp1 # ... and residue
|
||||
sltu $tmp1,$d0,$tmp1
|
||||
add $d1,$h1,$in1
|
||||
add $tmp0,$tmp0,$tmp1
|
||||
sltu $tmp1,$d1,$h1
|
||||
add $d1,$d1,$tmp0
|
||||
|
||||
add $d2,$h2,$padbit
|
||||
sltu $tmp0,$d1,$tmp0
|
||||
mulhu $h1,$r0,$d0 # h0*r0
|
||||
mul $h0,$r0,$d0
|
||||
|
||||
add $d2,$d2,$tmp1
|
||||
add $d2,$d2,$tmp0
|
||||
mulhu $tmp1,$rs1,$d1 # h1*5*r1
|
||||
mul $tmp0,$rs1,$d1
|
||||
|
||||
mulhu $h2,$r1,$d0 # h0*r1
|
||||
mul $tmp2,$r1,$d0
|
||||
add $h0,$h0,$tmp0
|
||||
add $h1,$h1,$tmp1
|
||||
sltu $tmp0,$h0,$tmp0
|
||||
|
||||
add $h1,$h1,$tmp0
|
||||
add $h1,$h1,$tmp2
|
||||
mulhu $tmp1,$r0,$d1 # h1*r0
|
||||
mul $tmp0,$r0,$d1
|
||||
|
||||
sltu $tmp2,$h1,$tmp2
|
||||
add $h2,$h2,$tmp2
|
||||
mul $tmp2,$rs1,$d2 # h2*5*r1
|
||||
|
||||
add $h1,$h1,$tmp0
|
||||
add $h2,$h2,$tmp1
|
||||
mul $tmp3,$r0,$d2 # h2*r0
|
||||
sltu $tmp0,$h1,$tmp0
|
||||
add $h2,$h2,$tmp0
|
||||
|
||||
add $h1,$h1,$tmp2
|
||||
sltu $tmp2,$h1,$tmp2
|
||||
add $h2,$h2,$tmp2
|
||||
add $h2,$h2,$tmp3
|
||||
|
||||
bne $inp,$len,.Loop
|
||||
|
||||
sd $h0,0($ctx) # store hash value
|
||||
sd $h1,8($ctx)
|
||||
sd $h2,16($ctx)
|
||||
|
||||
POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue
|
||||
POP $s1,2*__SIZEOF_POINTER__($sp)
|
||||
POP $s2,1*__SIZEOF_POINTER__($sp)
|
||||
POP $s3,0*__SIZEOF_POINTER__($sp)
|
||||
caddi $sp,$sp,4*__SIZEOF_POINTER__
|
||||
|
||||
.Lno_data:
|
||||
ret
|
||||
.size poly1305_blocks,.-poly1305_blocks
|
||||
___
|
||||
}
|
||||
{
|
||||
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
|
||||
|
||||
$code.=<<___;
|
||||
.globl poly1305_emit
|
||||
.type poly1305_emit,\@function
|
||||
poly1305_emit:
|
||||
#ifdef __riscv_zicfilp
|
||||
lpad 0
|
||||
#endif
|
||||
ld $tmp2,16($ctx)
|
||||
ld $tmp0,0($ctx)
|
||||
ld $tmp1,8($ctx)
|
||||
|
||||
andi $in0,$tmp2,-4 # final reduction
|
||||
srl $in1,$tmp2,2
|
||||
andi $tmp2,$tmp2,3
|
||||
add $in0,$in0,$in1
|
||||
|
||||
add $tmp0,$tmp0,$in0
|
||||
sltu $in1,$tmp0,$in0
|
||||
addi $in0,$tmp0,5 # compare to modulus
|
||||
add $tmp1,$tmp1,$in1
|
||||
sltiu $tmp3,$in0,5
|
||||
sltu $tmp4,$tmp1,$in1
|
||||
add $in1,$tmp1,$tmp3
|
||||
add $tmp2,$tmp2,$tmp4
|
||||
sltu $tmp3,$in1,$tmp3
|
||||
add $tmp2,$tmp2,$tmp3
|
||||
|
||||
srli $tmp2,$tmp2,2 # see if it carried/borrowed
|
||||
neg $tmp2,$tmp2
|
||||
|
||||
xor $in0,$in0,$tmp0
|
||||
xor $in1,$in1,$tmp1
|
||||
and $in0,$in0,$tmp2
|
||||
and $in1,$in1,$tmp2
|
||||
xor $in0,$in0,$tmp0
|
||||
xor $in1,$in1,$tmp1
|
||||
|
||||
lwu $tmp0,0($nonce) # load nonce
|
||||
lwu $tmp1,4($nonce)
|
||||
lwu $tmp2,8($nonce)
|
||||
lwu $tmp3,12($nonce)
|
||||
slli $tmp1,$tmp1,32
|
||||
slli $tmp3,$tmp3,32
|
||||
or $tmp0,$tmp0,$tmp1
|
||||
or $tmp2,$tmp2,$tmp3
|
||||
|
||||
add $in0,$in0,$tmp0 # accumulate nonce
|
||||
add $in1,$in1,$tmp2
|
||||
sltu $tmp0,$in0,$tmp0
|
||||
add $in1,$in1,$tmp0
|
||||
|
||||
#ifdef __riscv_misaligned_fast
|
||||
sd $in0,0($mac) # write mac value
|
||||
sd $in1,8($mac)
|
||||
#else
|
||||
srli $tmp0,$in0,8 # write mac value
|
||||
srli $tmp1,$in0,16
|
||||
srli $tmp2,$in0,24
|
||||
sb $in0,0($mac)
|
||||
srli $tmp3,$in0,32
|
||||
sb $tmp0,1($mac)
|
||||
srli $tmp0,$in0,40
|
||||
sb $tmp1,2($mac)
|
||||
srli $tmp1,$in0,48
|
||||
sb $tmp2,3($mac)
|
||||
srli $tmp2,$in0,56
|
||||
sb $tmp3,4($mac)
|
||||
srli $tmp3,$in1,8
|
||||
sb $tmp0,5($mac)
|
||||
srli $tmp0,$in1,16
|
||||
sb $tmp1,6($mac)
|
||||
srli $tmp1,$in1,24
|
||||
sb $tmp2,7($mac)
|
||||
|
||||
sb $in1,8($mac)
|
||||
srli $tmp2,$in1,32
|
||||
sb $tmp3,9($mac)
|
||||
srli $tmp3,$in1,40
|
||||
sb $tmp0,10($mac)
|
||||
srli $tmp0,$in1,48
|
||||
sb $tmp1,11($mac)
|
||||
srli $tmp1,$in1,56
|
||||
sb $tmp2,12($mac)
|
||||
sb $tmp3,13($mac)
|
||||
sb $tmp0,14($mac)
|
||||
sb $tmp1,15($mac)
|
||||
#endif
|
||||
|
||||
ret
|
||||
.size poly1305_emit,.-poly1305_emit
|
||||
.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
|
||||
___
|
||||
}
|
||||
}}} else {{{
|
||||
######################################################################
|
||||
# 32-bit code path
|
||||
#
|
||||
|
||||
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
|
||||
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
|
||||
($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);
|
||||
|
||||
$code.=<<___;
|
||||
#if __riscv_xlen == 32
|
||||
# if __SIZEOF_POINTER__ == 8
|
||||
# define PUSH csc
|
||||
# define POP clc
|
||||
# else
|
||||
# define PUSH sw
|
||||
# define POP lw
|
||||
# endif
|
||||
# define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b
|
||||
# define srliw srli
|
||||
# define srlw srl
|
||||
# define sllw sll
|
||||
# define addw add
|
||||
# define addiw addi
|
||||
# define mulw mul
|
||||
#elif __riscv_xlen == 64
|
||||
# if __SIZEOF_POINTER__ == 16
|
||||
# define PUSH csc
|
||||
# define POP clc
|
||||
# else
|
||||
# define PUSH sd
|
||||
# define POP ld
|
||||
# endif
|
||||
# define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
|
||||
#else
|
||||
# error "unsupported __riscv_xlen"
|
||||
#endif
|
||||
|
||||
.option pic
|
||||
.text
|
||||
|
||||
.globl poly1305_init
|
||||
.type poly1305_init,\@function
|
||||
poly1305_init:
|
||||
#ifdef __riscv_zicfilp
|
||||
lpad 0
|
||||
#endif
|
||||
sw $zero,0($ctx)
|
||||
sw $zero,4($ctx)
|
||||
sw $zero,8($ctx)
|
||||
sw $zero,12($ctx)
|
||||
sw $zero,16($ctx)
|
||||
|
||||
beqz $inp,.Lno_key
|
||||
|
||||
#ifndef __riscv_misaligned_fast
|
||||
andi $tmp0,$inp,3 # $inp % 4
|
||||
sub $inp,$inp,$tmp0 # align $inp
|
||||
sll $tmp0,$tmp0,3 # byte to bit offset
|
||||
#endif
|
||||
lw $in0,0($inp)
|
||||
lw $in1,4($inp)
|
||||
lw $in2,8($inp)
|
||||
lw $in3,12($inp)
|
||||
#ifndef __riscv_misaligned_fast
|
||||
beqz $tmp0,.Laligned_key
|
||||
|
||||
lw $tmp2,16($inp)
|
||||
sub $tmp1,$zero,$tmp0
|
||||
srlw $in0,$in0,$tmp0
|
||||
sllw $tmp3,$in1,$tmp1
|
||||
srlw $in1,$in1,$tmp0
|
||||
or $in0,$in0,$tmp3
|
||||
sllw $tmp3,$in2,$tmp1
|
||||
srlw $in2,$in2,$tmp0
|
||||
or $in1,$in1,$tmp3
|
||||
sllw $tmp3,$in3,$tmp1
|
||||
srlw $in3,$in3,$tmp0
|
||||
or $in2,$in2,$tmp3
|
||||
sllw $tmp2,$tmp2,$tmp1
|
||||
or $in3,$in3,$tmp2
|
||||
.Laligned_key:
|
||||
#endif
|
||||
|
||||
lui $tmp0,0x10000
|
||||
addi $tmp0,$tmp0,-1 # 0x0fffffff
|
||||
and $in0,$in0,$tmp0
|
||||
addi $tmp0,$tmp0,-3 # 0x0ffffffc
|
||||
and $in1,$in1,$tmp0
|
||||
and $in2,$in2,$tmp0
|
||||
and $in3,$in3,$tmp0
|
||||
|
||||
sw $in0,20($ctx)
|
||||
sw $in1,24($ctx)
|
||||
sw $in2,28($ctx)
|
||||
sw $in3,32($ctx)
|
||||
|
||||
srlw $tmp1,$in1,2
|
||||
srlw $tmp2,$in2,2
|
||||
srlw $tmp3,$in3,2
|
||||
addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
|
||||
addw $in2,$in2,$tmp2
|
||||
addw $in3,$in3,$tmp3
|
||||
sw $in1,36($ctx)
|
||||
sw $in2,40($ctx)
|
||||
sw $in3,44($ctx)
|
||||
.Lno_key:
|
||||
li $a0,0
|
||||
ret
|
||||
.size poly1305_init,.-poly1305_init
|
||||
___
|
||||
{
|
||||
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
|
||||
($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
|
||||
my ($d0,$d1,$d2,$d3) =
|
||||
($a4,$a5,$a6,$a7);
|
||||
my $shr = $ra; # used on R6
|
||||
|
||||
$code.=<<___;
|
||||
.globl poly1305_blocks
|
||||
.type poly1305_blocks,\@function
|
||||
poly1305_blocks:
|
||||
#ifdef __riscv_zicfilp
|
||||
lpad 0
|
||||
#endif
|
||||
andi $len,$len,-16 # complete blocks only
|
||||
beqz $len,.Labort
|
||||
|
||||
#ifdef __riscv_zcmp
|
||||
cm.push {ra,s0-s8}, -48
|
||||
#else
|
||||
caddi $sp,$sp,-__SIZEOF_POINTER__*12
|
||||
PUSH $ra, __SIZEOF_POINTER__*11($sp)
|
||||
PUSH $s0, __SIZEOF_POINTER__*10($sp)
|
||||
PUSH $s1, __SIZEOF_POINTER__*9($sp)
|
||||
PUSH $s2, __SIZEOF_POINTER__*8($sp)
|
||||
PUSH $s3, __SIZEOF_POINTER__*7($sp)
|
||||
PUSH $s4, __SIZEOF_POINTER__*6($sp)
|
||||
PUSH $s5, __SIZEOF_POINTER__*5($sp)
|
||||
PUSH $s6, __SIZEOF_POINTER__*4($sp)
|
||||
PUSH $s7, __SIZEOF_POINTER__*3($sp)
|
||||
PUSH $s8, __SIZEOF_POINTER__*2($sp)
|
||||
#endif
|
||||
|
||||
#ifndef __riscv_misaligned_fast
|
||||
andi $shr,$inp,3
|
||||
andi $inp,$inp,-4 # align $inp
|
||||
slli $shr,$shr,3 # byte to bit offset
|
||||
#endif
|
||||
|
||||
lw $h0,0($ctx) # load hash value
|
||||
lw $h1,4($ctx)
|
||||
lw $h2,8($ctx)
|
||||
lw $h3,12($ctx)
|
||||
lw $h4,16($ctx)
|
||||
|
||||
lw $r0,20($ctx) # load key
|
||||
lw $r1,24($ctx)
|
||||
lw $r2,28($ctx)
|
||||
lw $r3,32($ctx)
|
||||
lw $rs1,36($ctx)
|
||||
lw $rs2,40($ctx)
|
||||
lw $rs3,44($ctx)
|
||||
|
||||
add $len,$len,$inp # end of buffer
|
||||
|
||||
.Loop:
|
||||
lw $d0,0($inp) # load input
|
||||
lw $d1,4($inp)
|
||||
lw $d2,8($inp)
|
||||
lw $d3,12($inp)
|
||||
#ifndef __riscv_misaligned_fast
|
||||
beqz $shr,.Laligned_inp
|
||||
|
||||
lw $t4,16($inp)
|
||||
sub $t5,$zero,$shr
|
||||
srlw $d0,$d0,$shr
|
||||
sllw $t3,$d1,$t5
|
||||
srlw $d1,$d1,$shr
|
||||
or $d0,$d0,$t3
|
||||
sllw $t3,$d2,$t5
|
||||
srlw $d2,$d2,$shr
|
||||
or $d1,$d1,$t3
|
||||
sllw $t3,$d3,$t5
|
||||
srlw $d3,$d3,$shr
|
||||
or $d2,$d2,$t3
|
||||
sllw $t4,$t4,$t5
|
||||
or $d3,$d3,$t4
|
||||
|
||||
.Laligned_inp:
|
||||
#endif
|
||||
srliw $t3,$h4,2 # modulo-scheduled reduction
|
||||
andi $t4,$h4,-4
|
||||
andi $h4,$h4,3
|
||||
|
||||
addw $d0,$d0,$h0 # accumulate input
|
||||
addw $t4,$t4,$t3
|
||||
sltu $h0,$d0,$h0
|
||||
addw $d0,$d0,$t4 # ... and residue
|
||||
sltu $t4,$d0,$t4
|
||||
|
||||
addw $d1,$d1,$h1
|
||||
addw $h0,$h0,$t4 # carry
|
||||
sltu $h1,$d1,$h1
|
||||
addw $d1,$d1,$h0
|
||||
sltu $h0,$d1,$h0
|
||||
|
||||
addw $d2,$d2,$h2
|
||||
addw $h1,$h1,$h0 # carry
|
||||
sltu $h2,$d2,$h2
|
||||
addw $d2,$d2,$h1
|
||||
sltu $h1,$d2,$h1
|
||||
|
||||
addw $d3,$d3,$h3
|
||||
addw $h2,$h2,$h1 # carry
|
||||
sltu $h3,$d3,$h3
|
||||
addw $d3,$d3,$h2
|
||||
|
||||
MULX ($h1,$h0,$r0,$d0) # d0*r0
|
||||
|
||||
sltu $h2,$d3,$h2
|
||||
addw $h3,$h3,$h2 # carry
|
||||
|
||||
MULX ($t4,$t3,$rs3,$d1) # d1*s3
|
||||
|
||||
addw $h4,$h4,$padbit
|
||||
caddi $inp,$inp,16
|
||||
addw $h4,$h4,$h3
|
||||
|
||||
MULX ($t6,$a3,$rs2,$d2) # d2*s2
|
||||
addw $h0,$h0,$t3
|
||||
addw $h1,$h1,$t4
|
||||
sltu $t3,$h0,$t3
|
||||
addw $h1,$h1,$t3
|
||||
|
||||
MULX ($t4,$t3,$rs1,$d3) # d3*s1
|
||||
addw $h0,$h0,$a3
|
||||
addw $h1,$h1,$t6
|
||||
sltu $a3,$h0,$a3
|
||||
addw $h1,$h1,$a3
|
||||
|
||||
|
||||
MULX ($h2,$a3,$r1,$d0) # d0*r1
|
||||
addw $h0,$h0,$t3
|
||||
addw $h1,$h1,$t4
|
||||
sltu $t3,$h0,$t3
|
||||
addw $h1,$h1,$t3
|
||||
|
||||
MULX ($t4,$t3,$r0,$d1) # d1*r0
|
||||
addw $h1,$h1,$a3
|
||||
sltu $a3,$h1,$a3
|
||||
addw $h2,$h2,$a3
|
||||
|
||||
MULX ($t6,$a3,$rs3,$d2) # d2*s3
|
||||
addw $h1,$h1,$t3
|
||||
addw $h2,$h2,$t4
|
||||
sltu $t3,$h1,$t3
|
||||
addw $h2,$h2,$t3
|
||||
|
||||
MULX ($t4,$t3,$rs2,$d3) # d3*s2
|
||||
addw $h1,$h1,$a3
|
||||
addw $h2,$h2,$t6
|
||||
sltu $a3,$h1,$a3
|
||||
addw $h2,$h2,$a3
|
||||
|
||||
mulw $a3,$rs1,$h4 # h4*s1
|
||||
addw $h1,$h1,$t3
|
||||
addw $h2,$h2,$t4
|
||||
sltu $t3,$h1,$t3
|
||||
addw $h2,$h2,$t3
|
||||
|
||||
|
||||
MULX ($h3,$t3,$r2,$d0) # d0*r2
|
||||
addw $h1,$h1,$a3
|
||||
sltu $a3,$h1,$a3
|
||||
addw $h2,$h2,$a3
|
||||
|
||||
MULX ($t6,$a3,$r1,$d1) # d1*r1
|
||||
addw $h2,$h2,$t3
|
||||
sltu $t3,$h2,$t3
|
||||
addw $h3,$h3,$t3
|
||||
|
||||
MULX ($t4,$t3,$r0,$d2) # d2*r0
|
||||
addw $h2,$h2,$a3
|
||||
addw $h3,$h3,$t6
|
||||
sltu $a3,$h2,$a3
|
||||
addw $h3,$h3,$a3
|
||||
|
||||
MULX ($t6,$a3,$rs3,$d3) # d3*s3
|
||||
addw $h2,$h2,$t3
|
||||
addw $h3,$h3,$t4
|
||||
sltu $t3,$h2,$t3
|
||||
addw $h3,$h3,$t3
|
||||
|
||||
mulw $t3,$rs2,$h4 # h4*s2
|
||||
addw $h2,$h2,$a3
|
||||
addw $h3,$h3,$t6
|
||||
sltu $a3,$h2,$a3
|
||||
addw $h3,$h3,$a3
|
||||
|
||||
|
||||
MULX ($t6,$a3,$r3,$d0) # d0*r3
|
||||
addw $h2,$h2,$t3
|
||||
sltu $t3,$h2,$t3
|
||||
addw $h3,$h3,$t3
|
||||
|
||||
MULX ($t4,$t3,$r2,$d1) # d1*r2
|
||||
addw $h3,$h3,$a3
|
||||
sltu $a3,$h3,$a3
|
||||
addw $t6,$t6,$a3
|
||||
|
||||
MULX ($a3,$d3,$r0,$d3) # d3*r0
|
||||
addw $h3,$h3,$t3
|
||||
addw $t6,$t6,$t4
|
||||
sltu $t3,$h3,$t3
|
||||
addw $t6,$t6,$t3
|
||||
|
||||
MULX ($t4,$t3,$r1,$d2) # d2*r1
|
||||
addw $h3,$h3,$d3
|
||||
addw $t6,$t6,$a3
|
||||
sltu $d3,$h3,$d3
|
||||
addw $t6,$t6,$d3
|
||||
|
||||
mulw $a3,$rs3,$h4 # h4*s3
|
||||
addw $h3,$h3,$t3
|
||||
addw $t6,$t6,$t4
|
||||
sltu $t3,$h3,$t3
|
||||
addw $t6,$t6,$t3
|
||||
|
||||
|
||||
mulw $h4,$r0,$h4 # h4*r0
|
||||
addw $h3,$h3,$a3
|
||||
sltu $a3,$h3,$a3
|
||||
addw $t6,$t6,$a3
|
||||
addw $h4,$t6,$h4
|
||||
|
||||
li $padbit,1 # if we loop, padbit is 1
|
||||
|
||||
bne $inp,$len,.Loop
|
||||
|
||||
sw $h0,0($ctx) # store hash value
|
||||
sw $h1,4($ctx)
|
||||
sw $h2,8($ctx)
|
||||
sw $h3,12($ctx)
|
||||
sw $h4,16($ctx)
|
||||
|
||||
#ifdef __riscv_zcmp
|
||||
cm.popret {ra,s0-s8}, 48
|
||||
#else
|
||||
POP $ra, __SIZEOF_POINTER__*11($sp)
|
||||
POP $s0, __SIZEOF_POINTER__*10($sp)
|
||||
POP $s1, __SIZEOF_POINTER__*9($sp)
|
||||
POP $s2, __SIZEOF_POINTER__*8($sp)
|
||||
POP $s3, __SIZEOF_POINTER__*7($sp)
|
||||
POP $s4, __SIZEOF_POINTER__*6($sp)
|
||||
POP $s5, __SIZEOF_POINTER__*5($sp)
|
||||
POP $s6, __SIZEOF_POINTER__*4($sp)
|
||||
POP $s7, __SIZEOF_POINTER__*3($sp)
|
||||
POP $s8, __SIZEOF_POINTER__*2($sp)
|
||||
caddi $sp,$sp,__SIZEOF_POINTER__*12
|
||||
#endif
|
||||
.Labort:
|
||||
ret
|
||||
.size poly1305_blocks,.-poly1305_blocks
|
||||
___
|
||||
}
|
||||
{
|
||||
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
|
||||
|
||||
$code.=<<___;
|
||||
.globl poly1305_emit
|
||||
.type poly1305_emit,\@function
|
||||
poly1305_emit:
|
||||
#ifdef __riscv_zicfilp
|
||||
lpad 0
|
||||
#endif
|
||||
lw $tmp4,16($ctx)
|
||||
lw $tmp0,0($ctx)
|
||||
lw $tmp1,4($ctx)
|
||||
lw $tmp2,8($ctx)
|
||||
lw $tmp3,12($ctx)
|
||||
|
||||
srliw $ctx,$tmp4,2 # final reduction
|
||||
andi $in0,$tmp4,-4
|
||||
andi $tmp4,$tmp4,3
|
||||
addw $ctx,$ctx,$in0
|
||||
|
||||
addw $tmp0,$tmp0,$ctx
|
||||
sltu $ctx,$tmp0,$ctx
|
||||
addiw $in0,$tmp0,5 # compare to modulus
|
||||
addw $tmp1,$tmp1,$ctx
|
||||
sltiu $in1,$in0,5
|
||||
sltu $ctx,$tmp1,$ctx
|
||||
addw $in1,$in1,$tmp1
|
||||
addw $tmp2,$tmp2,$ctx
|
||||
sltu $in2,$in1,$tmp1
|
||||
sltu $ctx,$tmp2,$ctx
|
||||
addw $in2,$in2,$tmp2
|
||||
addw $tmp3,$tmp3,$ctx
|
||||
sltu $in3,$in2,$tmp2
|
||||
sltu $ctx,$tmp3,$ctx
|
||||
addw $in3,$in3,$tmp3
|
||||
addw $tmp4,$tmp4,$ctx
|
||||
sltu $ctx,$in3,$tmp3
|
||||
addw $ctx,$ctx,$tmp4
|
||||
|
||||
srl $ctx,$ctx,2 # see if it carried/borrowed
|
||||
sub $ctx,$zero,$ctx
|
||||
|
||||
xor $in0,$in0,$tmp0
|
||||
xor $in1,$in1,$tmp1
|
||||
xor $in2,$in2,$tmp2
|
||||
xor $in3,$in3,$tmp3
|
||||
and $in0,$in0,$ctx
|
||||
and $in1,$in1,$ctx
|
||||
and $in2,$in2,$ctx
|
||||
and $in3,$in3,$ctx
|
||||
xor $in0,$in0,$tmp0
|
||||
xor $in1,$in1,$tmp1
|
||||
xor $in2,$in2,$tmp2
|
||||
xor $in3,$in3,$tmp3
|
||||
|
||||
lw $tmp0,0($nonce) # load nonce
|
||||
lw $tmp1,4($nonce)
|
||||
lw $tmp2,8($nonce)
|
||||
lw $tmp3,12($nonce)
|
||||
|
||||
addw $in0,$in0,$tmp0 # accumulate nonce
|
||||
sltu $ctx,$in0,$tmp0
|
||||
|
||||
addw $in1,$in1,$tmp1
|
||||
sltu $tmp1,$in1,$tmp1
|
||||
addw $in1,$in1,$ctx
|
||||
sltu $ctx,$in1,$ctx
|
||||
addw $ctx,$ctx,$tmp1
|
||||
|
||||
addw $in2,$in2,$tmp2
|
||||
sltu $tmp2,$in2,$tmp2
|
||||
addw $in2,$in2,$ctx
|
||||
sltu $ctx,$in2,$ctx
|
||||
addw $ctx,$ctx,$tmp2
|
||||
|
||||
addw $in3,$in3,$tmp3
|
||||
addw $in3,$in3,$ctx
|
||||
|
||||
#ifdef __riscv_misaligned_fast
|
||||
sw $in0,0($mac) # write mac value
|
||||
sw $in1,4($mac)
|
||||
sw $in2,8($mac)
|
||||
sw $in3,12($mac)
|
||||
#else
|
||||
srl $tmp0,$in0,8 # write mac value
|
||||
srl $tmp1,$in0,16
|
||||
srl $tmp2,$in0,24
|
||||
sb $in0, 0($mac)
|
||||
sb $tmp0,1($mac)
|
||||
srl $tmp0,$in1,8
|
||||
sb $tmp1,2($mac)
|
||||
srl $tmp1,$in1,16
|
||||
sb $tmp2,3($mac)
|
||||
srl $tmp2,$in1,24
|
||||
sb $in1, 4($mac)
|
||||
sb $tmp0,5($mac)
|
||||
srl $tmp0,$in2,8
|
||||
sb $tmp1,6($mac)
|
||||
srl $tmp1,$in2,16
|
||||
sb $tmp2,7($mac)
|
||||
srl $tmp2,$in2,24
|
||||
sb $in2, 8($mac)
|
||||
sb $tmp0,9($mac)
|
||||
srl $tmp0,$in3,8
|
||||
sb $tmp1,10($mac)
|
||||
srl $tmp1,$in3,16
|
||||
sb $tmp2,11($mac)
|
||||
srl $tmp2,$in3,24
|
||||
sb $in3, 12($mac)
|
||||
sb $tmp0,13($mac)
|
||||
sb $tmp1,14($mac)
|
||||
sb $tmp2,15($mac)
|
||||
#endif
|
||||
|
||||
ret
|
||||
.size poly1305_emit,.-poly1305_emit
|
||||
.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
|
||||
___
|
||||
}
|
||||
}}}
|
||||
|
||||
# Emit the accumulated assembly one line at a time, rewriting the
# capability-neutral pseudo-mnemonics used in the templates (caddi, cmove,
# PUSH/POP operands) for the selected flavour.
foreach (split("\n", $code)) {
    if ($flavour =~ /^cheri/) {
	# Memory operand "(xN)" becomes "(cN)"; only when that matched is
	# the load/store mnemonic (l|s + b|h|w|d, optional u) prefixed
	# with "c".
	s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
	# PUSH/POP take a capability register; otherwise ret/jal get the
	# "c" prefix.
	s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
	s/\b(ret|jal)\b/c$1/;
	# "and" binds tighter than "or": either caddi/cadd turns into
	# cincoffset with its comma-terminated xN operands renamed to cN,
	# or, on a cmove line, every xN register is renamed to cN.
	s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
	m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
    } else {
	# Non-CHERI: both caddi and cadd are spelled "add", cmove is "mv".
	s/\bcaddi?\b/add/ or
	s/\bcmove\b/mv/;
    }
    print $_, "\n";
}

# Output is buffered, so a write failure may only surface at close time.
# Fail loudly rather than leave a truncated file behind with a zero exit
# status.
close STDOUT or die "error closing STDOUT: $!";
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* OpenSSL/Cryptogams accelerated Poly1305 transform for riscv
|
||||
*
|
||||
* Copyright (C) 2025 Institute of Software, CAS.
|
||||
*/
|
||||
|
||||
/*
 * Entry points implemented by the perlasm-generated riscv/poly1305-core.S.
 *
 * poly1305_block_init() is the assembly's poly1305_init, renamed when the
 * file is assembled with -Dpoly1305_init=poly1305_block_init. It zeroes the
 * accumulator words at the start of @state and, when @raw_key is non-NULL,
 * stores the masked low 16 bytes of the key followed by the precomputed
 * r + (r >> 2) values. The assembly also returns 0 in a0, which this void
 * prototype ignores.
 */
asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
|
||||
const u8 raw_key[POLY1305_BLOCK_SIZE]);
|
||||
/*
 * Absorb whole 16-byte blocks from @src into @state. @len is rounded down
 * to a multiple of 16 and the call does nothing if the result is zero.
 * @hibit is added into the top limb of the accumulator for each block.
 *
 * NOTE(review): the 32-bit flavour reuses the hibit register as a scratch
 * register and reloads it with 1 before looping, so hibit == 0 appears to
 * be honoured for the first block of a call only - confirm that callers
 * pass hibit == 0 only together with a single block.
 */
asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
|
||||
const u8 *src, u32 len, u32 hibit);
|
||||
/*
 * Perform the final reduction of the accumulator held in @state, add the
 * 128-bit @nonce (read as four 32-bit words) and write the 16-byte result
 * to @digest. Unless the assembly is built with __riscv_misaligned_fast,
 * @digest is written one byte at a time.
 */
asmlinkage void poly1305_emit(const struct poly1305_state *state,
|
||||
u8 digest[POLY1305_DIGEST_SIZE],
|
||||
const u32 nonce[4]);
|
||||
Loading…
Reference in New Issue