313 lines
8.3 KiB
C
313 lines
8.3 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Authors: Jackie Liu <liuyun01@kylinos.cn>
|
|
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
|
|
*/
|
|
|
|
#include <linux/cache.h>
|
|
#include <asm/neon-intrinsics.h>
|
|
#include "xor_impl.h"
|
|
#include "xor_arch.h"
|
|
#include "xor-neon.h"
|
|
|
|
static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
|
|
const unsigned long * __restrict p2)
|
|
{
|
|
uint64_t *dp1 = (uint64_t *)p1;
|
|
uint64_t *dp2 = (uint64_t *)p2;
|
|
|
|
register uint64x2_t v0, v1, v2, v3;
|
|
long lines = bytes / (sizeof(uint64x2_t) * 4);
|
|
|
|
do {
|
|
/* p1 ^= p2 */
|
|
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
|
|
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
|
|
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
|
|
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
|
|
|
|
/* store */
|
|
vst1q_u64(dp1 + 0, v0);
|
|
vst1q_u64(dp1 + 2, v1);
|
|
vst1q_u64(dp1 + 4, v2);
|
|
vst1q_u64(dp1 + 6, v3);
|
|
|
|
dp1 += 8;
|
|
dp2 += 8;
|
|
} while (--lines > 0);
|
|
}
|
|
|
|
static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
|
|
const unsigned long * __restrict p2,
|
|
const unsigned long * __restrict p3)
|
|
{
|
|
uint64_t *dp1 = (uint64_t *)p1;
|
|
uint64_t *dp2 = (uint64_t *)p2;
|
|
uint64_t *dp3 = (uint64_t *)p3;
|
|
|
|
register uint64x2_t v0, v1, v2, v3;
|
|
long lines = bytes / (sizeof(uint64x2_t) * 4);
|
|
|
|
do {
|
|
/* p1 ^= p2 */
|
|
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
|
|
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
|
|
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
|
|
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
|
|
|
|
/* p1 ^= p3 */
|
|
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
|
|
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
|
|
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
|
|
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
|
|
|
|
/* store */
|
|
vst1q_u64(dp1 + 0, v0);
|
|
vst1q_u64(dp1 + 2, v1);
|
|
vst1q_u64(dp1 + 4, v2);
|
|
vst1q_u64(dp1 + 6, v3);
|
|
|
|
dp1 += 8;
|
|
dp2 += 8;
|
|
dp3 += 8;
|
|
} while (--lines > 0);
|
|
}
|
|
|
|
static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
|
|
const unsigned long * __restrict p2,
|
|
const unsigned long * __restrict p3,
|
|
const unsigned long * __restrict p4)
|
|
{
|
|
uint64_t *dp1 = (uint64_t *)p1;
|
|
uint64_t *dp2 = (uint64_t *)p2;
|
|
uint64_t *dp3 = (uint64_t *)p3;
|
|
uint64_t *dp4 = (uint64_t *)p4;
|
|
|
|
register uint64x2_t v0, v1, v2, v3;
|
|
long lines = bytes / (sizeof(uint64x2_t) * 4);
|
|
|
|
do {
|
|
/* p1 ^= p2 */
|
|
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
|
|
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
|
|
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
|
|
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
|
|
|
|
/* p1 ^= p3 */
|
|
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
|
|
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
|
|
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
|
|
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
|
|
|
|
/* p1 ^= p4 */
|
|
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
|
|
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
|
|
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
|
|
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
|
|
|
|
/* store */
|
|
vst1q_u64(dp1 + 0, v0);
|
|
vst1q_u64(dp1 + 2, v1);
|
|
vst1q_u64(dp1 + 4, v2);
|
|
vst1q_u64(dp1 + 6, v3);
|
|
|
|
dp1 += 8;
|
|
dp2 += 8;
|
|
dp3 += 8;
|
|
dp4 += 8;
|
|
} while (--lines > 0);
|
|
}
|
|
|
|
static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
|
|
const unsigned long * __restrict p2,
|
|
const unsigned long * __restrict p3,
|
|
const unsigned long * __restrict p4,
|
|
const unsigned long * __restrict p5)
|
|
{
|
|
uint64_t *dp1 = (uint64_t *)p1;
|
|
uint64_t *dp2 = (uint64_t *)p2;
|
|
uint64_t *dp3 = (uint64_t *)p3;
|
|
uint64_t *dp4 = (uint64_t *)p4;
|
|
uint64_t *dp5 = (uint64_t *)p5;
|
|
|
|
register uint64x2_t v0, v1, v2, v3;
|
|
long lines = bytes / (sizeof(uint64x2_t) * 4);
|
|
|
|
do {
|
|
/* p1 ^= p2 */
|
|
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
|
|
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
|
|
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
|
|
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
|
|
|
|
/* p1 ^= p3 */
|
|
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
|
|
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
|
|
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
|
|
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
|
|
|
|
/* p1 ^= p4 */
|
|
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
|
|
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
|
|
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
|
|
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
|
|
|
|
/* p1 ^= p5 */
|
|
v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
|
|
v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
|
|
v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
|
|
v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));
|
|
|
|
/* store */
|
|
vst1q_u64(dp1 + 0, v0);
|
|
vst1q_u64(dp1 + 2, v1);
|
|
vst1q_u64(dp1 + 4, v2);
|
|
vst1q_u64(dp1 + 6, v3);
|
|
|
|
dp1 += 8;
|
|
dp2 += 8;
|
|
dp3 += 8;
|
|
dp4 += 8;
|
|
dp5 += 8;
|
|
} while (--lines > 0);
|
|
}
|
|
|
|
__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
|
|
__xor_neon_5);
|
|
|
|
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
|
|
{
|
|
uint64x2_t res;
|
|
|
|
asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
|
|
"eor3 %0.16b, %1.16b, %2.16b, %3.16b"
|
|
: "=w"(res) : "w"(p), "w"(q), "w"(r));
|
|
return res;
|
|
}
|
|
|
|
static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
|
|
const unsigned long * __restrict p2,
|
|
const unsigned long * __restrict p3)
|
|
{
|
|
uint64_t *dp1 = (uint64_t *)p1;
|
|
uint64_t *dp2 = (uint64_t *)p2;
|
|
uint64_t *dp3 = (uint64_t *)p3;
|
|
|
|
register uint64x2_t v0, v1, v2, v3;
|
|
long lines = bytes / (sizeof(uint64x2_t) * 4);
|
|
|
|
do {
|
|
/* p1 ^= p2 ^ p3 */
|
|
v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
|
|
vld1q_u64(dp3 + 0));
|
|
v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
|
|
vld1q_u64(dp3 + 2));
|
|
v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
|
|
vld1q_u64(dp3 + 4));
|
|
v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
|
|
vld1q_u64(dp3 + 6));
|
|
|
|
/* store */
|
|
vst1q_u64(dp1 + 0, v0);
|
|
vst1q_u64(dp1 + 2, v1);
|
|
vst1q_u64(dp1 + 4, v2);
|
|
vst1q_u64(dp1 + 6, v3);
|
|
|
|
dp1 += 8;
|
|
dp2 += 8;
|
|
dp3 += 8;
|
|
} while (--lines > 0);
|
|
}
|
|
|
|
static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
|
|
const unsigned long * __restrict p2,
|
|
const unsigned long * __restrict p3,
|
|
const unsigned long * __restrict p4)
|
|
{
|
|
uint64_t *dp1 = (uint64_t *)p1;
|
|
uint64_t *dp2 = (uint64_t *)p2;
|
|
uint64_t *dp3 = (uint64_t *)p3;
|
|
uint64_t *dp4 = (uint64_t *)p4;
|
|
|
|
register uint64x2_t v0, v1, v2, v3;
|
|
long lines = bytes / (sizeof(uint64x2_t) * 4);
|
|
|
|
do {
|
|
/* p1 ^= p2 ^ p3 */
|
|
v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
|
|
vld1q_u64(dp3 + 0));
|
|
v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
|
|
vld1q_u64(dp3 + 2));
|
|
v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
|
|
vld1q_u64(dp3 + 4));
|
|
v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
|
|
vld1q_u64(dp3 + 6));
|
|
|
|
/* p1 ^= p4 */
|
|
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
|
|
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
|
|
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
|
|
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
|
|
|
|
/* store */
|
|
vst1q_u64(dp1 + 0, v0);
|
|
vst1q_u64(dp1 + 2, v1);
|
|
vst1q_u64(dp1 + 4, v2);
|
|
vst1q_u64(dp1 + 6, v3);
|
|
|
|
dp1 += 8;
|
|
dp2 += 8;
|
|
dp3 += 8;
|
|
dp4 += 8;
|
|
} while (--lines > 0);
|
|
}
|
|
|
|
static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
|
|
const unsigned long * __restrict p2,
|
|
const unsigned long * __restrict p3,
|
|
const unsigned long * __restrict p4,
|
|
const unsigned long * __restrict p5)
|
|
{
|
|
uint64_t *dp1 = (uint64_t *)p1;
|
|
uint64_t *dp2 = (uint64_t *)p2;
|
|
uint64_t *dp3 = (uint64_t *)p3;
|
|
uint64_t *dp4 = (uint64_t *)p4;
|
|
uint64_t *dp5 = (uint64_t *)p5;
|
|
|
|
register uint64x2_t v0, v1, v2, v3;
|
|
long lines = bytes / (sizeof(uint64x2_t) * 4);
|
|
|
|
do {
|
|
/* p1 ^= p2 ^ p3 */
|
|
v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
|
|
vld1q_u64(dp3 + 0));
|
|
v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
|
|
vld1q_u64(dp3 + 2));
|
|
v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
|
|
vld1q_u64(dp3 + 4));
|
|
v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
|
|
vld1q_u64(dp3 + 6));
|
|
|
|
/* p1 ^= p4 ^ p5 */
|
|
v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
|
|
v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
|
|
v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
|
|
v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
|
|
|
|
/* store */
|
|
vst1q_u64(dp1 + 0, v0);
|
|
vst1q_u64(dp1 + 2, v1);
|
|
vst1q_u64(dp1 + 4, v2);
|
|
vst1q_u64(dp1 + 6, v3);
|
|
|
|
dp1 += 8;
|
|
dp2 += 8;
|
|
dp3 += 8;
|
|
dp4 += 8;
|
|
dp5 += 8;
|
|
} while (--lines > 0);
|
|
}
|
|
|
|
__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4,
|
|
__xor_eor3_5);
|