diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index 28a171f4fb..e1b77616de 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -887,6 +887,7 @@ WOLFSSL_RENESAS_RZN2L WOLFSSL_RENESAS_TLS WOLFSSL_RENESAS_TSIP_IAREWRX WOLFSSL_REQUIRE_TCA +WOLFSSL_RISCV_ASM_NO_UNALIGNED WOLFSSL_RNG_USE_FULL_SEED WOLFSSL_RSA_CHECK_D_ON_DECRYPT WOLFSSL_RSA_DECRYPT_TO_0_LEN diff --git a/configure.ac b/configure.ac index 24ab654b4e..f214060845 100644 --- a/configure.ac +++ b/configure.ac @@ -3859,7 +3859,7 @@ do # FSL, FSR, FSRI, CMOV, CMIX - QEMU doesn't know about these instructions AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_BIT_MANIPULATION_TERNARY" ;; - zkn|zkned) + zkned) # AES encrypt/decrpyt, SHA-2 ENABLED_RISCV_ASM=yes AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_SCALAR_CRYPTO_ASM" diff --git a/wolfcrypt/src/port/riscv/riscv-64-aes.c b/wolfcrypt/src/port/riscv/riscv-64-aes.c index 37918217e9..64d77a3630 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-aes.c +++ b/wolfcrypt/src/port/riscv/riscv-64-aes.c @@ -1871,8 +1871,7 @@ int wc_AesSetKey(Aes* aes, const byte* key, word32 keyLen, const byte* iv, static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out) { __asm__ __volatile__ ( - "ld t2, 0(%[in])\n\t" - "ld t3, 8(%[in])\n\t" + UNALIGNED_LD2(t2, t3, 0, %[in], t0) "ld a3, 0(%[key])\n\t" "ld a4, 8(%[key])\n\t" "ld a5, 16(%[key])\n\t" @@ -1897,8 +1896,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out) AESENC_2_ROUNDS(208, 216, 224, 232) "L_aes_encrypt_done:\n\t" AESENC_LAST_ROUND() - "sd t2, 0(%[out])\n\t" - "sd t3, 8(%[out])\n\t" + UNALIGNED_SD2(t2, t3, 0, %[out], t0) : : [in] "r" (in), [out] "r" (out), [key] "r" (aes->key), [rounds] "r" (aes->rounds) @@ -1918,8 +1916,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out) static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out) { __asm__ __volatile__ ( - "ld t2, 0(%[in])\n\t" - "ld t3, 8(%[in])\n\t" + UNALIGNED_LD2(t2, t3, 0, %[in], t0) "ld a3, 
0(%[key])\n\t" "ld a4, 8(%[key])\n\t" "ld a5, 16(%[key])\n\t" @@ -1944,8 +1941,7 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out) AESDEC_2_ROUNDS(208, 216, 224, 232) "L_aes_decrypt_done:\n\t" AESDEC_LAST_ROUND() - "sd t2, 0(%[out])\n\t" - "sd t3, 8(%[out])\n\t" + UNALIGNED_SD2(t2, t3, 0, %[out], t0) : : [in] "r" (in), [out] "r" (out), [key] "r" (aes->key), [rounds] "r" (aes->rounds) @@ -3209,8 +3205,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out) LOAD_WORD_REV(t2, 8, %[in]) LOAD_WORD_REV(t3, 12, %[in]) #else - "ld t1, 0(%[in])\n\t" - "ld t3, 8(%[in])\n\t" + UNALIGNED_LD2(t1, t3, 0, %[in], t0) REV8(REG_T1, REG_T1) REV8(REG_T3, REG_T3) "srli t0, t1, 32\n\t" @@ -3376,16 +3371,14 @@ static void wc_AesEncrypt(Aes* aes, const byte* in, byte* out) REV8(REG_T1, REG_T1) REV8(REG_T3, REG_T3) /* Write encrypted block to output. */ - "sd t1, 0(%[out])\n\t" - "sd t3, 8(%[out])\n\t" + UNALIGNED_SD2(t1, t3, 0, %[out], t0) #else PACK(REG_T1, REG_A5, REG_A4) PACK(REG_T3, REG_A7, REG_A6) REV8(REG_T1, REG_T1) REV8(REG_T3, REG_T3) /* Write encrypted block to output. */ - "sd t1, 0(%[out])\n\t" - "sd t3, 8(%[out])\n\t" + UNALIGNED_SD2(t1, t3, 0, %[out], t0) #endif : @@ -3641,8 +3634,7 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out) LOAD_WORD_REV(t2, 8, %[in]) LOAD_WORD_REV(t3, 12, %[in]) #else - "ld t1, 0(%[in])\n\t" - "ld t3, 8(%[in])\n\t" + UNALIGNED_LD2(t1, t3, 0, %[in], t0) REV8(REG_T1, REG_T1) REV8(REG_T3, REG_T3) "srli t0, t1, 32\n\t" @@ -3793,16 +3785,14 @@ static void wc_AesDecrypt(Aes* aes, const byte* in, byte* out) REV8(REG_T1, REG_T1) REV8(REG_T3, REG_T3) /* Write encrypted block to output. */ - "sd t1, 0(%[out])\n\t" - "sd t3, 8(%[out])\n\t" + UNALIGNED_SD2(t1, t3, 0, %[out], t0) #else PACK(REG_T1, REG_A5, REG_A4) PACK(REG_T3, REG_A7, REG_A6) REV8(REG_T1, REG_T1) REV8(REG_T3, REG_T3) /* Write encrypted block to output. 
*/ - "sd t1, 0(%[out])\n\t" - "sd t3, 8(%[out])\n\t" + UNALIGNED_SD2(t1, t3, 0, %[out], t0) #endif : @@ -4113,7 +4103,7 @@ static WC_INLINE void IncrementAesCounter(byte* inOutCtr) */ int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) { - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; word32 processed; int ret = 0; @@ -4563,8 +4553,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz, byte* s, word32 sSz) { if (gcm != NULL) { - byte x[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte x[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; byte* h = gcm->H; __asm__ __volatile__ ( @@ -4896,8 +4886,8 @@ static void GMULT(byte* x, byte* y) void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz, byte* s, word32 sSz) { - byte x[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte x[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; word32 blocks, partial; byte* h; @@ -5163,8 +5153,7 @@ static void ghash_blocks(byte* x, byte* y, const byte* in, word32 blocks) "L_ghash_loop:\n\t" /* Load input block. */ - "ld t5, 0(%[in])\n\t" - "ld a5, 8(%[in])\n\t" + UNALIGNED_LD2(t5, a5, 0, %[in], t4) /* Reverse bits to match x. 
*/ #ifdef WOLFSSL_RISCV_BIT_MANIPULATION BREV8(REG_T5, REG_T5) @@ -5307,8 +5296,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz, byte* s, word32 sSz) { if (gcm != NULL) { - byte x[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte x[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; word32 blocks, partial; byte* h = gcm->H; @@ -5388,8 +5377,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz, const byte* aad, word32 aadSz) { - byte counter[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte counter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. * Some cases was stack pointer plus offset others was a register containing * address. To make uniform for passing in to inline assembly code am using @@ -5886,8 +5875,8 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz, const byte* aad, word32 aadSz) { - byte counter[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte counter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. * Some cases was stack pointer plus offset others was a register containing * address. To make uniform for passing in to inline assembly code am using @@ -6398,8 +6387,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* nonce, word32 nonceSz, byte* tag, word32 tagSz, const byte* aad, word32 aadSz) { - byte counter[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte counter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. 
* Some cases was stack pointer plus offset others was a register containing * address. To make uniform for passing in to inline assembly code am using @@ -7003,8 +6992,8 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* aad, word32 aadSz) { int ret = 0; - byte counter[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte counter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. * Some cases was stack pointer plus offset others was a register containing * address. To make uniform for passing in to inline assembly code am using @@ -7512,8 +7501,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* aad, word32 aadSz) { int ret = 0; - byte counter[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte counter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. * Some cases was stack pointer plus offset others was a register containing * address. To make uniform for passing in to inline assembly code am using @@ -8035,8 +8024,8 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* aad, word32 aadSz) { int ret = 0; - byte counter[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte counter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. * Some cases was stack pointer plus offset others was a register containing * address. 
To make uniform for passing in to inline assembly code am using @@ -8733,8 +8722,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz, byte* s, word32 sSz) { if (gcm != NULL) { - byte x[WC_AES_BLOCK_SIZE]; - byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte x[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; word32 blocks, partial; XMEMSET(x, 0, WC_AES_BLOCK_SIZE); @@ -8834,9 +8823,9 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, word32 partial = sz % WC_AES_BLOCK_SIZE; const byte* p = in; byte* c = out; - ALIGN16 byte counter[WC_AES_BLOCK_SIZE]; - ALIGN16 byte initialCounter[WC_AES_BLOCK_SIZE]; - ALIGN16 byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte counter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte initialCounter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; /* Validate parameters. */ if ((aes == NULL) || (nonce == NULL) || (nonceSz == 0) || (tag == NULL) || @@ -8934,10 +8923,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, word32 partial = sz % WC_AES_BLOCK_SIZE; const byte* c = in; byte* p = out; - ALIGN16 byte counter[WC_AES_BLOCK_SIZE]; - ALIGN16 byte scratch[WC_AES_BLOCK_SIZE]; - ALIGN16 byte Tprime[WC_AES_BLOCK_SIZE]; - ALIGN16 byte EKY0[WC_AES_BLOCK_SIZE]; + ALIGN8 byte counter[WC_AES_BLOCK_SIZE]; + ALIGN8 byte scratch[WC_AES_BLOCK_SIZE]; + ALIGN8 byte Tprime[WC_AES_BLOCK_SIZE]; + ALIGN8 byte EKY0[WC_AES_BLOCK_SIZE]; sword32 res; /* Validate parameters. 
*/ diff --git a/wolfcrypt/src/port/riscv/riscv-64-chacha.c b/wolfcrypt/src/port/riscv/riscv-64-chacha.c index c150e4bb7f..ba0a8b21d3 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-chacha.c +++ b/wolfcrypt/src/port/riscv/riscv-64-chacha.c @@ -1825,9 +1825,9 @@ static WC_INLINE void wc_chacha_encrypt_64(const word32* input, const byte* m, VSETIVLI(REG_X0, 2, 1, 1, 0b011, 0b000) VMV_X_S(REG_T0, REG_V0) VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) - "ld t1, (%[m])\n\t" + UNALIGNED_LD(t1, 0, %[m], t2) "xor t1, t1, t0\n\t" - "sd t1, (%[c])\n\t" + UNALIGNED_SD(t1, 0, %[c], t2) "addi %[bytes], %[bytes], -8\n\t" "addi %[c], %[c], 8\n\t" "addi %[m], %[m], 8\n\t" @@ -2155,10 +2155,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, "bltz %[bytes], L_chacha20_riscv_over\n\t" #if !defined(WOLFSSL_RISCV_BIT_MANIPULATION) - "ld t0, 0(%[m])\n\t" - "ld t1, 8(%[m])\n\t" - "ld t2, 16(%[m])\n\t" - "ld s1, 24(%[m])\n\t" + UNALIGNED_LD4(t0, t1, t2, s1, 0, %[m], a3) "xor a4, a4, t0\n\t" "xor a6, a6, t1\n\t" "xor t3, t3, t2\n\t" @@ -2171,10 +2168,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, "xor a7, a7, t1\n\t" "xor t4, t4, t2\n\t" "xor t6, t6, s1\n\t" - "ld t0, 32(%[m])\n\t" - "ld t1, 40(%[m])\n\t" - "ld t2, 48(%[m])\n\t" - "ld s1, 56(%[m])\n\t" + UNALIGNED_LD4(t0, t1, t2, s1, 32, %[m], a3) "xor s2, s2, t0\n\t" "xor s4, s4, t1\n\t" "xor s6, s6, t2\n\t" @@ -2187,22 +2181,8 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, "xor s5, s5, t1\n\t" "xor s7, s7, t2\n\t" "xor s9, s9, s1\n\t" - "sw a4, 0(%[c])\n\t" - "sw a5, 4(%[c])\n\t" - "sw a6, 8(%[c])\n\t" - "sw a7, 12(%[c])\n\t" - "sw t3, 16(%[c])\n\t" - "sw t4, 20(%[c])\n\t" - "sw t5, 24(%[c])\n\t" - "sw t6, 28(%[c])\n\t" - "sw s2, 32(%[c])\n\t" - "sw s3, 36(%[c])\n\t" - "sw s4, 40(%[c])\n\t" - "sw s5, 44(%[c])\n\t" - "sw s6, 48(%[c])\n\t" - "sw s7, 52(%[c])\n\t" - "sw s8, 56(%[c])\n\t" - "sw s9, 60(%[c])\n\t" + UNALIGNED_SW8(a4, a5, a6, a7, t3, t4, t5, 
t6, 0, %[c], t0) + UNALIGNED_SW8(s2, s3, s4, s5, s6, s7, s8, s9, 32, %[c], t0) #else PACK(REG_A4, REG_A4, REG_A5) PACK(REG_A6, REG_A6, REG_A7) @@ -2212,14 +2192,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, PACK(REG_S4, REG_S4, REG_S5) PACK(REG_S6, REG_S6, REG_S7) PACK(REG_S8, REG_S8, REG_S9) - "ld a5, 0(%[m])\n\t" - "ld a7, 8(%[m])\n\t" - "ld t4, 16(%[m])\n\t" - "ld t6, 24(%[m])\n\t" - "ld s3, 32(%[m])\n\t" - "ld s5, 40(%[m])\n\t" - "ld s7, 48(%[m])\n\t" - "ld s9, 56(%[m])\n\t" + UNALIGNED_LD8(a5, a7, t4, t6, s3, s5, s7, s9, 0, %[m], t0) "xor a4, a4, a5\n\t" "xor a6, a6, a7\n\t" "xor t3, t3, t4\n\t" @@ -2228,14 +2201,7 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, "xor s4, s4, s5\n\t" "xor s6, s6, s7\n\t" "xor s8, s8, s9\n\t" - "sd a4, 0(%[c])\n\t" - "sd a6, 8(%[c])\n\t" - "sd t3, 16(%[c])\n\t" - "sd t5, 24(%[c])\n\t" - "sd s2, 32(%[c])\n\t" - "sd s4, 40(%[c])\n\t" - "sd s6, 48(%[c])\n\t" - "sd s8, 56(%[c])\n\t" + UNALIGNED_SD8(a4, a6, t3, t5, s2, s4, s6, s8, 0, %[c], t0) #endif "addi %[m], %[m], 64\n\t" @@ -2268,10 +2234,10 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, "bltz t0, L_chacha20_riscv_32bit\n\t" "addi a3, a3, -1\n\t" "L_chacha20_riscv_64bit_loop:\n\t" - "ld t0, (%[m])\n\t" + UNALIGNED_LD(t0, 0, %[m], t2) "ld t1, (%[over])\n\t" "xor t0, t0, t1\n\t" - "sd t0, (%[c])\n\t" + UNALIGNED_SD(t0, 0, %[c], t2) "addi %[m], %[m], 8\n\t" "addi %[c], %[c], 8\n\t" "addi %[over], %[over], 8\n\t" @@ -2282,10 +2248,10 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, "L_chacha20_riscv_32bit:\n\t" "addi t0, a3, -4\n\t" "bltz t0, L_chacha20_riscv_16bit\n\t" - "lw t0, (%[m])\n\t" + UNALIGNED_LW(t0, 0, %[m], t2) "lw t1, (%[over])\n\t" "xor t0, t0, t1\n\t" - "sw t0, (%[c])\n\t" + UNALIGNED_SW(t0, 0, %[c], t2) "addi %[m], %[m], 4\n\t" "addi %[c], %[c], 4\n\t" "addi %[over], %[over], 4\n\t" @@ -2293,10 +2259,10 @@ static WC_INLINE void 
wc_chacha_encrypt(const word32* input, const byte* m, "L_chacha20_riscv_16bit:\n\t" "addi t0, a3, -2\n\t" "bltz t0, L_chacha20_riscv_8bit\n\t" - "lh t0, (%[m])\n\t" + UNALIGNED_LH(t0, 0, %[m], t2) "lh t1, (%[over])\n\t" "xor t0, t0, t1\n\t" - "sh t0, (%[c])\n\t" + UNALIGNED_SH(t0, 0, %[c], t2) "addi %[m], %[m], 2\n\t" "addi %[c], %[c], 2\n\t" "addi %[over], %[over], 2\n\t" diff --git a/wolfcrypt/src/port/riscv/riscv-64-poly1305.c b/wolfcrypt/src/port/riscv/riscv-64-poly1305.c index de846ffde8..5733b75bc7 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-poly1305.c +++ b/wolfcrypt/src/port/riscv/riscv-64-poly1305.c @@ -145,8 +145,7 @@ static WC_INLINE void poly1305_blocks_riscv64_16(Poly1305* ctx, "L_poly1305_riscv64_16_64_loop_%=:\n\t" /* Load m */ - "ld t0, (%[m])\n\t" - "ld t1, 8(%[m])\n\t" + UNALIGNED_LD2(t0, t1, 0, %[m], t5) /* Split m into 26, 52, 52 */ SPLIT_130(t2, t3, t4, t0, t1, %[notLast], t5) @@ -285,8 +284,7 @@ void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m, "L_poly1305_riscv64_vec_loop_%=:\n\t" /* m0 + nfin */ - "ld t0, 0(%[m])\n\t" - "ld t1, 8(%[m])\n\t" + UNALIGNED_LD2(t0, t1, 0, %[m], t5) "li t6, 1\n\t" /* Split m into 24, 52, 52 */ SPLIT_130(t2, t3, t4, t0, t1, t6, t5) @@ -294,8 +292,7 @@ void poly1305_blocks_riscv64(Poly1305* ctx, const unsigned char *m, VMV_S_X(REG_V12, REG_T3) VMV_S_X(REG_V13, REG_T4) /* m1+ nfin */ - "ld t0, 16(%[m])\n\t" - "ld t1, 24(%[m])\n\t" + UNALIGNED_LD2(t0, t1, 16, %[m], t5) /* Split m into 24, 52, 52 */ SPLIT_130(t2, t3, t4, t0, t1, t6, t5) VMV_S_X(REG_V14, REG_T2) @@ -464,10 +461,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) __asm__ __volatile__ ( /* Load key material */ - "ld t0, 0(%[key])\n\t" - "ld t1, 8(%[key])\n\t" - "ld t2, 16(%[key])\n\t" - "ld t3, 24(%[key])\n\t" + UNALIGNED_LD4(t0, t1, t2, t3, 0, %[key], t4) /* Load clamp */ "ld t4, 0(%[clamp])\n\t" "ld t5, 8(%[clamp])\n\t" @@ -636,8 +630,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) "sltu t3, t1, t3\n\t" "add 
t2, t2, t3\n\t" "andi t2, t2, 3\n\t" - "sd t0, 0(%[mac])\n\t" - "sd t1, 8(%[mac])\n\t" + UNALIGNED_SD2(t0, t1, 0, %[mac], t2) /* Zero out h. */ "sd x0, %[ctx_h_0]\n\t" "sd x0, %[ctx_h_1]\n\t" diff --git a/wolfcrypt/src/port/riscv/riscv-64-sha256.c b/wolfcrypt/src/port/riscv/riscv-64-sha256.c index 44af0b80c3..f672db2739 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-sha256.c +++ b/wolfcrypt/src/port/riscv/riscv-64-sha256.c @@ -484,14 +484,7 @@ static WC_OMIT_FRAME_POINTER WC_INLINE void Sha256Transform(wc_Sha256* sha256, LOAD_DWORD_REV(s6, 48, %[data], a4, a5, a6, a7) LOAD_DWORD_REV(s7, 56, %[data], a4, a5, a6, a7) #else - "lwu a4, 0(%[data])\n\t" - "lwu s0, 4(%[data])\n\t" - "lwu a5, 8(%[data])\n\t" - "lwu s1, 12(%[data])\n\t" - "lwu a6, 16(%[data])\n\t" - "lwu s2, 20(%[data])\n\t" - "lwu a7, 24(%[data])\n\t" - "lwu s3, 28(%[data])\n\t" + UNALIGNED_LWU8(a4, s0, a5, s1, a6, s2, a7, s3, 0, %[data], t4) PACK_BB(s0, s0, a4, REG_S0, REG_S0, REG_A4) PACK_BB(s1, s1, a5, REG_S1, REG_S1, REG_A5) PACK_BB(s2, s2, a6, REG_S2, REG_S2, REG_A6) @@ -500,14 +493,7 @@ static WC_OMIT_FRAME_POINTER WC_INLINE void Sha256Transform(wc_Sha256* sha256, REV8(REG_S1, REG_S1) REV8(REG_S2, REG_S2) REV8(REG_S3, REG_S3) - "lwu a4, 32(%[data])\n\t" - "lwu s4, 36(%[data])\n\t" - "lwu a5, 40(%[data])\n\t" - "lwu s5, 44(%[data])\n\t" - "lwu a6, 48(%[data])\n\t" - "lwu s6, 52(%[data])\n\t" - "lwu a7, 56(%[data])\n\t" - "lwu s7, 60(%[data])\n\t" + UNALIGNED_LWU8(a4, s4, a5, s5, a6, s6, a7, s7, 32, %[data], t4) PACK_BB(s4, s4, a4, REG_S4, REG_S4, REG_A4) PACK_BB(s5, s5, a5, REG_S5, REG_S5, REG_A5) PACK_BB(s6, s6, a6, REG_S6, REG_S6, REG_A6) @@ -840,31 +826,18 @@ static WC_INLINE void Sha256Final(wc_Sha256* sha256, byte* hash) "srli t2, t3, 32\n\t" "srli a4, a5, 32\n\t" "srli a6, a7, 32\n\t" - "sw t0, 0(%[hash])\n\t" - "sw t1, 4(%[hash])\n\t" - "sw t2, 8(%[hash])\n\t" - "sw t3, 12(%[hash])\n\t" - "sw a4, 16(%[hash])\n\t" - "sw a5, 20(%[hash])\n\t" - "sw a6, 24(%[hash])\n\t" - "sw a7, 28(%[hash])\n\t" + 
UNALIGNED_SW8(t0, t1, t2, t3, a4, a5, a6, a7, 0, %[hash], t4) #else LOAD_WORD_REV(t0, 0, %[digest], t2, t3, t4) LOAD_WORD_REV(t1, 4, %[digest], t2, t3, t4) LOAD_WORD_REV(a4, 8, %[digest], t2, t3, t4) LOAD_WORD_REV(a5, 12, %[digest], t2, t3, t4) - "sw t0, 0(%[hash])\n\t" - "sw t1, 4(%[hash])\n\t" - "sw a4, 8(%[hash])\n\t" - "sw a5, 12(%[hash])\n\t" + UNALIGNED_SW4(t0, t1, a4, a5, 0, %[hash], t2) LOAD_WORD_REV(t0, 16, %[digest], t2, t3, t4) LOAD_WORD_REV(t1, 20, %[digest], t2, t3, t4) LOAD_WORD_REV(a4, 24, %[digest], t2, t3, t4) LOAD_WORD_REV(a5, 28, %[digest], t2, t3, t4) - "sw t0, 16(%[hash])\n\t" - "sw t1, 20(%[hash])\n\t" - "sw a4, 24(%[hash])\n\t" - "sw a5, 28(%[hash])\n\t" + UNALIGNED_SW4(t0, t1, a4, a5, 16, %[hash], t2) #endif : : [digest] "r" (sha256->digest), [hash] "r" (hash) diff --git a/wolfcrypt/src/port/riscv/riscv-64-sha3.c b/wolfcrypt/src/port/riscv/riscv-64-sha3.c index 9d2a959afe..3ecb68d065 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-sha3.c +++ b/wolfcrypt/src/port/riscv/riscv-64-sha3.c @@ -139,7 +139,7 @@ static const word64 hash_keccak_r[24] = #endif -void BlockSha3(word64* s) +WC_OMIT_FRAME_POINTER void BlockSha3(word64* s) { const word64* r = hash_keccak_r; diff --git a/wolfcrypt/src/port/riscv/riscv-64-sha512.c b/wolfcrypt/src/port/riscv/riscv-64-sha512.c index 473b115dc0..8a2fd23e26 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-sha512.c +++ b/wolfcrypt/src/port/riscv/riscv-64-sha512.c @@ -554,14 +554,7 @@ static WC_INLINE void Sha512Transform(wc_Sha512* sha512, const byte* data, LOAD_DWORD_REV(s6, 48, %[data], a4, a5, a6, a7) LOAD_DWORD_REV(s7, 56, %[data], a4, a5, a6, a7) #else - "ld t4, 0(%[data])\n\t" - "ld s1, 8(%[data])\n\t" - "ld s2, 16(%[data])\n\t" - "ld s3, 24(%[data])\n\t" - "ld s4, 32(%[data])\n\t" - "ld s5, 40(%[data])\n\t" - "ld s6, 48(%[data])\n\t" - "ld s7, 56(%[data])\n\t" + UNALIGNED_LD8(t4, s1, s2, s3, s4, s5, s6, s7, 0, %[data], t5) REV8(REG_T4, REG_T4) REV8(REG_S1, REG_S1) REV8(REG_S2, REG_S2) @@ -946,14 +939,7 @@ 
static WC_INLINE void Sha512Final(wc_Sha512* sha512, byte* hash, int hashLen) REV8(REG_S9, REG_S9) REV8(REG_S10, REG_S10) REV8(REG_S11, REG_S11) - "sd t0, 0(%[hash])\n\t" - "sd t1, 8(%[hash])\n\t" - "sd t2, 16(%[hash])\n\t" - "sd t3, 24(%[hash])\n\t" - "sd s8, 32(%[hash])\n\t" - "sd s9, 40(%[hash])\n\t" - "sd s10, 48(%[hash])\n\t" - "sd s11, 56(%[hash])\n\t" + UNALIGNED_SD8(t0, t1, t2, t3, s8, s9, s10, s11, 0, %[hash], t4) #else LOAD_DWORD_REV(t0, 0, %[digest], a4, a5, a6, a7) LOAD_DWORD_REV(t1, 8, %[digest], a4, a5, a6, a7) @@ -963,14 +949,7 @@ static WC_INLINE void Sha512Final(wc_Sha512* sha512, byte* hash, int hashLen) LOAD_DWORD_REV(s9, 40, %[digest], a4, a5, a6, a7) LOAD_DWORD_REV(s10, 48, %[digest], a4, a5, a6, a7) LOAD_DWORD_REV(s11, 56, %[digest], a4, a5, a6, a7) - "sd t0, 0(%[hash])\n\t" - "sd t1, 8(%[hash])\n\t" - "sd t2, 16(%[hash])\n\t" - "sd t3, 24(%[hash])\n\t" - "sd s8, 32(%[hash])\n\t" - "sd s9, 40(%[hash])\n\t" - "sd s10, 48(%[hash])\n\t" - "sd s11, 56(%[hash])\n\t" + UNALIGNED_SD8(t0, t1, t2, t3, s8, s9, s10, s11, 0, %[hash], t4) #endif : : [digest] "r" (sha512->digest), [hash] "r" (hashRes) diff --git a/wolfssl/wolfcrypt/chacha.h b/wolfssl/wolfcrypt/chacha.h index 5ae021fa43..a2467ba1f9 100644 --- a/wolfssl/wolfcrypt/chacha.h +++ b/wolfssl/wolfcrypt/chacha.h @@ -86,9 +86,10 @@ typedef struct ChaCha { byte extra[12]; #endif word32 left; /* number of bytes leftover */ -#if defined(USE_INTEL_CHACHA_SPEEDUP) || defined(USE_ARM_CHACHA_SPEEDUP) || \ - defined(WOLFSSL_RISCV_ASM) +#if defined(USE_INTEL_CHACHA_SPEEDUP) || defined(USE_ARM_CHACHA_SPEEDUP) word32 over[CHACHA_CHUNK_WORDS]; +#elif defined(WOLFSSL_RISCV_ASM) + ALIGN8 word32 over[CHACHA_CHUNK_WORDS]; #endif } ChaCha; diff --git a/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h b/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h index 70cabff507..11090c65e5 100644 --- a/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h +++ b/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h @@ -181,6 +181,633 @@ /* 32-bit width when 
loading. */ #define WIDTH_32 0b110 +/* + * Scalar load/store helpers. + * + * Each macro performs the same operation as the ld/lwu/lw/lh/sd/sw/sh + * instruction it is named after. By default it expands to that native + * instruction. When built with WOLFSSL_RISCV_ASM_NO_UNALIGNED - for cores that + * don't support misaligned access - it checks the alignment of the base + * register p at run time (the constant offset o is assumed to preserve it) + * and dispatches to the widest supported sequence: the native doubleword + * access (ld/sd, 64-bit forms only) when 8-byte aligned, word-wise (lwu/sw) + * when 4-byte aligned, half-wise (lhu/sh) when 2-byte aligned, otherwise + * byte-wise (lbu/sb). The 16-bit forms (LH/SH) skip the check and are always + * byte-wise. The narrower _BY_BYTE / _BY_HALF / _BY_WORD forms are also + * exposed for sites that already know the alignment. Values are + * little-endian, matching the native instructions either way. + * + * Bulk variants UNALIGNED_{LD,SD,LWU,LW,SW}N (N = 2, 4, 8) issue N + * consecutive accesses starting at o(p) - stride 8 bytes for LD/SD, 4 bytes + * for LWU/LW/SW. Under WOLFSSL_RISCV_ASM_NO_UNALIGNED they share a single + * alignment check across all N elements; otherwise they are just N native + * instructions back-to-back. + * + * r = data register: destination (loads) or source (stores). The multi-step + * load sequences write r before their last read through p, so for loads + * r must differ from p. + * o = constant byte offset + * p = base address register + * t = scratch register - must differ from r and p; clobbered. For stores the + * data register r is preserved. Only used when + * WOLFSSL_RISCV_ASM_NO_UNALIGNED is defined; ignored otherwise. + */ + +/* Apply X to N doublewords at 8-byte stride starting at o(p). X is invoked + * as X(r, o, p, t) once per register, with the offset advanced by 8 each time. */ +#define UNALIGNED_DW_REP2(X, r0, r1, o, p, t) \ + X(r0, o, p, t) \ + X(r1, o+8, p, t) +#define UNALIGNED_DW_REP4(X, r0, r1, r2, r3, o, p, t) \ + X(r0, o, p, t) \ + X(r1, o+8, p, t) \ + X(r2, o+16, p, t) \ + X(r3, o+24, p, t) +#define UNALIGNED_DW_REP8(X, r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + X(r0, o, p, t) \ + X(r1, o+8, p, t) \ + X(r2, o+16, p, t) \ + X(r3, o+24, p, t) \ + X(r4, o+32, p, t) \ + X(r5, o+40, p, t) \ + X(r6, o+48, p, t) \ + X(r7, o+56, p, t) + +/* Apply X to N words at 4-byte stride starting at o(p). 
*/ +#define UNALIGNED_W_REP2(X, r0, r1, o, p, t) \ + X(r0, o, p, t) \ + X(r1, o+4, p, t) +#define UNALIGNED_W_REP4(X, r0, r1, r2, r3, o, p, t) \ + X(r0, o, p, t) \ + X(r1, o+4, p, t) \ + X(r2, o+8, p, t) \ + X(r3, o+12, p, t) +#define UNALIGNED_W_REP8(X, r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + X(r0, o, p, t) \ + X(r1, o+4, p, t) \ + X(r2, o+8, p, t) \ + X(r3, o+12, p, t) \ + X(r4, o+16, p, t) \ + X(r5, o+20, p, t) \ + X(r6, o+24, p, t) \ + X(r7, o+28, p, t) + +#ifndef WOLFSSL_RISCV_ASM_NO_UNALIGNED + +/* Load 64-bits with one native ld; the scratch argument t is unused. */ +#define UNALIGNED_LD(r, o, p, t) \ + "ld " #r ", " #o "(" #p ")\n\t" + +/* Load 32-bits, zero extended (native lwu; t unused). */ +#define UNALIGNED_LWU(r, o, p, t) \ + "lwu " #r ", " #o "(" #p ")\n\t" + +/* Load 32-bits, sign extended (native lw; t unused). */ +#define UNALIGNED_LW(r, o, p, t) \ + "lw " #r ", " #o "(" #p ")\n\t" + +/* Load 16-bits, sign extended (native lh; t unused). */ +#define UNALIGNED_LH(r, o, p, t) \ + "lh " #r ", " #o "(" #p ")\n\t" + +/* Store 64-bits (native sd; t unused). */ +#define UNALIGNED_SD(r, o, p, t) \ + "sd " #r ", " #o "(" #p ")\n\t" + +/* Store 32-bits (native sw; t unused). */ +#define UNALIGNED_SW(r, o, p, t) \ + "sw " #r ", " #o "(" #p ")\n\t" + +/* Store 16-bits (native sh; t unused). */ +#define UNALIGNED_SH(r, o, p, t) \ + "sh " #r ", " #o "(" #p ")\n\t" + +/* Bulk variants - hardware handles unaligned access, so just emit N native + * instructions, 8 bytes apart for LD/SD and 4 bytes apart for LWU/LW/SW. 
*/ +#define UNALIGNED_LD2(r0, r1, o, p, t) \ + UNALIGNED_DW_REP2(UNALIGNED_LD, r0, r1, o, p, t) +#define UNALIGNED_LD4(r0, r1, r2, r3, o, p, t) \ + UNALIGNED_DW_REP4(UNALIGNED_LD, r0, r1, r2, r3, o, p, t) +#define UNALIGNED_LD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + UNALIGNED_DW_REP8(UNALIGNED_LD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) +#define UNALIGNED_SD2(r0, r1, o, p, t) \ + UNALIGNED_DW_REP2(UNALIGNED_SD, r0, r1, o, p, t) +#define UNALIGNED_SD4(r0, r1, r2, r3, o, p, t) \ + UNALIGNED_DW_REP4(UNALIGNED_SD, r0, r1, r2, r3, o, p, t) +#define UNALIGNED_SD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + UNALIGNED_DW_REP8(UNALIGNED_SD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) +#define UNALIGNED_LWU2(r0, r1, o, p, t) \ + UNALIGNED_W_REP2(UNALIGNED_LWU, r0, r1, o, p, t) +#define UNALIGNED_LWU4(r0, r1, r2, r3, o, p, t) \ + UNALIGNED_W_REP4(UNALIGNED_LWU, r0, r1, r2, r3, o, p, t) +#define UNALIGNED_LWU8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + UNALIGNED_W_REP8(UNALIGNED_LWU, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) +#define UNALIGNED_LW2(r0, r1, o, p, t) \ + UNALIGNED_W_REP2(UNALIGNED_LW, r0, r1, o, p, t) +#define UNALIGNED_LW4(r0, r1, r2, r3, o, p, t) \ + UNALIGNED_W_REP4(UNALIGNED_LW, r0, r1, r2, r3, o, p, t) +#define UNALIGNED_LW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + UNALIGNED_W_REP8(UNALIGNED_LW, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) +#define UNALIGNED_SW2(r0, r1, o, p, t) \ + UNALIGNED_W_REP2(UNALIGNED_SW, r0, r1, o, p, t) +#define UNALIGNED_SW4(r0, r1, r2, r3, o, p, t) \ + UNALIGNED_W_REP4(UNALIGNED_SW, r0, r1, r2, r3, o, p, t) +#define UNALIGNED_SW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + UNALIGNED_W_REP8(UNALIGNED_SW, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) + +#else + +/* Load 64-bits. 
*/ +#define UNALIGNED_LD_BY_BYTE(r, o, p, t) \ + "lbu " #r ", " #o "+0(" #p ")\n\t" \ + "lbu " #t ", " #o "+1(" #p ")\n\t" \ + "slli " #t ", " #t ", 8\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lbu " #t ", " #o "+2(" #p ")\n\t" \ + "slli " #t ", " #t ", 16\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lbu " #t ", " #o "+3(" #p ")\n\t" \ + "slli " #t ", " #t ", 24\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lbu " #t ", " #o "+4(" #p ")\n\t" \ + "slli " #t ", " #t ", 32\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lbu " #t ", " #o "+5(" #p ")\n\t" \ + "slli " #t ", " #t ", 40\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lbu " #t ", " #o "+6(" #p ")\n\t" \ + "slli " #t ", " #t ", 48\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lbu " #t ", " #o "+7(" #p ")\n\t" \ + "slli " #t ", " #t ", 56\n\t" \ + "or " #r ", " #r ", " #t "\n\t" +#define UNALIGNED_LD_BY_HALF(r, o, p, t) \ + "lhu " #r ", " #o "+0(" #p ")\n\t" \ + "lhu " #t ", " #o "+2(" #p ")\n\t" \ + "slli " #t ", " #t ", 16\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lhu " #t ", " #o "+4(" #p ")\n\t" \ + "slli " #t ", " #t ", 32\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lhu " #t ", " #o "+6(" #p ")\n\t" \ + "slli " #t ", " #t ", 48\n\t" \ + "or " #r ", " #r ", " #t "\n\t" +#define UNALIGNED_LD_BY_WORD(r, o, p, t) \ + "lwu " #r ", " #o "+0(" #p ")\n\t" \ + "lwu " #t ", " #o "+4(" #p ")\n\t" \ + "slli " #t ", " #t ", 32\n\t" \ + "or " #r ", " #r ", " #t "\n\t" +#define UNALIGNED_LD_BY_DWORD(r, o, p, t) \ + "ld " #r ", " #o "(" #p ")\n\t" +/* Run-time dispatch on the low bits of p: native ld when 8-byte aligned, else + * the word, half or byte sequence. Only p is tested, so this assumes o is a + * multiple of 8. Uses assembler local labels 1-4; t is clobbered. 
*/ +#define UNALIGNED_LD(r, o, p, t) \ + "andi " #t ", " #p ", 7\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_LD_BY_DWORD(r, o, p, t) \ + "j 4f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 3\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_LD_BY_WORD(r, o, p, t) \ + "j 4f\n\t" \ + "2:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 3f\n\t" \ + UNALIGNED_LD_BY_HALF(r, o, p, t) \ + "j 4f\n\t" \ + "3:\n\t" \ + UNALIGNED_LD_BY_BYTE(r, o, p, t) \ + "4:\n\t" + +/* Load 32-bits, zero extended. The narrow forms load the low part into r, + * then load each further part into t, shift it into place and OR it into r. */ +#define UNALIGNED_LWU_BY_BYTE(r, o, p, t) \ + "lbu " #r ", " #o "+0(" #p ")\n\t" \ + "lbu " #t ", " #o "+1(" #p ")\n\t" \ + "slli " #t ", " #t ", 8\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lbu " #t ", " #o "+2(" #p ")\n\t" \ + "slli " #t ", " #t ", 16\n\t" \ + "or " #r ", " #r ", " #t "\n\t" \ + "lbu " #t ", " #o "+3(" #p ")\n\t" \ + "slli " #t ", " #t ", 24\n\t" \ + "or " #r ", " #r ", " #t "\n\t" +#define UNALIGNED_LWU_BY_HALF(r, o, p, t) \ + "lhu " #r ", " #o "+0(" #p ")\n\t" \ + "lhu " #t ", " #o "+2(" #p ")\n\t" \ + "slli " #t ", " #t ", 16\n\t" \ + "or " #r ", " #r ", " #t "\n\t" +#define UNALIGNED_LWU_BY_WORD(r, o, p, t) \ + "lwu " #r ", " #o "(" #p ")\n\t" +/* Run-time dispatch on the low bits of p: native lwu when 4-byte aligned, + * else the half or byte sequence. Only p is tested, so this assumes o is a + * multiple of 4. Uses assembler local labels 1-3; t is clobbered. */ +#define UNALIGNED_LWU(r, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_LWU_BY_WORD(r, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_LWU_BY_HALF(r, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_LWU_BY_BYTE(r, o, p, t) \ + "3:\n\t" + +/* Load 32-bits, sign extended. The byte and half forms build the + * zero-extended value with the LWU sequence and then apply sext.w to r. */ +#define UNALIGNED_LW_BY_BYTE(r, o, p, t) \ + UNALIGNED_LWU_BY_BYTE(r, o, p, t) \ + "sext.w " #r ", " #r "\n\t" +#define UNALIGNED_LW_BY_HALF(r, o, p, t) \ + UNALIGNED_LWU_BY_HALF(r, o, p, t) \ + "sext.w " #r ", " #r "\n\t" +#define UNALIGNED_LW_BY_WORD(r, o, p, t) \ + "lw " #r ", " #o "(" #p ")\n\t" +/* Assumes o is a multiple of 4. 
*/ +#define UNALIGNED_LW(r, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_LW_BY_WORD(r, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_LW_BY_HALF(r, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_LW_BY_BYTE(r, o, p, t) \ + "3:\n\t" + +/* Load 16-bits, sign extended. */ +#define UNALIGNED_LH_BY_BYTE(r, o, p, t) \ + "lbu " #r ", " #o "+0(" #p ")\n\t" \ + "lb " #t ", " #o "+1(" #p ")\n\t" \ + "slli " #t ", " #t ", 8\n\t" \ + "or " #r ", " #r ", " #t "\n\t" +#define UNALIGNED_LH(r, o, p, t) \ + UNALIGNED_LH_BY_BYTE(r, o, p, t) + +/* Store 64-bits. */ +#define UNALIGNED_SD_BY_BYTE(r, o, p, t) \ + "sb " #r ", " #o "+0(" #p ")\n\t" \ + "srli " #t ", " #r ", 8\n\t" \ + "sb " #t ", " #o "+1(" #p ")\n\t" \ + "srli " #t ", " #r ", 16\n\t" \ + "sb " #t ", " #o "+2(" #p ")\n\t" \ + "srli " #t ", " #r ", 24\n\t" \ + "sb " #t ", " #o "+3(" #p ")\n\t" \ + "srli " #t ", " #r ", 32\n\t" \ + "sb " #t ", " #o "+4(" #p ")\n\t" \ + "srli " #t ", " #r ", 40\n\t" \ + "sb " #t ", " #o "+5(" #p ")\n\t" \ + "srli " #t ", " #r ", 48\n\t" \ + "sb " #t ", " #o "+6(" #p ")\n\t" \ + "srli " #t ", " #r ", 56\n\t" \ + "sb " #t ", " #o "+7(" #p ")\n\t" +#define UNALIGNED_SD_BY_HALF(r, o, p, t) \ + "sh " #r ", " #o "+0(" #p ")\n\t" \ + "srli " #t ", " #r ", 16\n\t" \ + "sh " #t ", " #o "+2(" #p ")\n\t" \ + "srli " #t ", " #r ", 32\n\t" \ + "sh " #t ", " #o "+4(" #p ")\n\t" \ + "srli " #t ", " #r ", 48\n\t" \ + "sh " #t ", " #o "+6(" #p ")\n\t" +#define UNALIGNED_SD_BY_WORD(r, o, p, t) \ + "sw " #r ", " #o "+0(" #p ")\n\t" \ + "srli " #t ", " #r ", 32\n\t" \ + "sw " #t ", " #o "+4(" #p ")\n\t" +#define UNALIGNED_SD_BY_DWORD(r, o, p, t) \ + "sd " #r ", " #o "(" #p ")\n\t" +/* Assumes o is a multiple of 8. 
*/ +#define UNALIGNED_SD(r, o, p, t) \ + "andi " #t ", " #p ", 7\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_SD_BY_DWORD(r, o, p, t) \ + "j 4f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 3\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_SD_BY_WORD(r, o, p, t) \ + "j 4f\n\t" \ + "2:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 3f\n\t" \ + UNALIGNED_SD_BY_HALF(r, o, p, t) \ + "j 4f\n\t" \ + "3:\n\t" \ + UNALIGNED_SD_BY_BYTE(r, o, p, t) \ + "4:\n\t" + +/* Store 32-bits. r is the register to store, o the offset, p the base pointer register and t a temporary register that receives the shifted copies of r; r itself is only read. The least significant piece goes to the lowest address. */ +#define UNALIGNED_SW_BY_BYTE(r, o, p, t) \ + "sb " #r ", " #o "+0(" #p ")\n\t" \ + "srli " #t ", " #r ", 8\n\t" \ + "sb " #t ", " #o "+1(" #p ")\n\t" \ + "srli " #t ", " #r ", 16\n\t" \ + "sb " #t ", " #o "+2(" #p ")\n\t" \ + "srli " #t ", " #r ", 24\n\t" \ + "sb " #t ", " #o "+3(" #p ")\n\t" +#define UNALIGNED_SW_BY_HALF(r, o, p, t) \ + "sh " #r ", " #o "+0(" #p ")\n\t" \ + "srli " #t ", " #r ", 16\n\t" \ + "sh " #t ", " #o "+2(" #p ")\n\t" +#define UNALIGNED_SW_BY_WORD(r, o, p, t) \ + "sw " #r ", " #o "(" #p ")\n\t" +/* Assumes o is a multiple of 4. Only the low 2 bits of p are tested to choose between the word, half-word and byte variants; label 3 is the common exit. */ +#define UNALIGNED_SW(r, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_SW_BY_WORD(r, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_SW_BY_HALF(r, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_SW_BY_BYTE(r, o, p, t) \ + "3:\n\t" + +/* Store 16-bits. Always stored byte by byte with no alignment test, low byte of r first; t is overwritten with r shifted right by 8. */ +#define UNALIGNED_SH_BY_BYTE(r, o, p, t) \ + "sb " #r ", " #o "+0(" #p ")\n\t" \ + "srli " #t ", " #r ", 8\n\t" \ + "sb " #t ", " #o "+1(" #p ")\n\t" +#define UNALIGNED_SH(r, o, p, t) \ + UNALIGNED_SH_BY_BYTE(r, o, p, t) + +/* Load 2 64-bits. Assumes o is a multiple of 8.
*/ +#define UNALIGNED_LD2(r0, r1, o, p, t) \ + "andi " #t ", " #p ", 7\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_DW_REP2(UNALIGNED_LD_BY_DWORD, r0, r1, o, p, t) \ + "j 4f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 3\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_DW_REP2(UNALIGNED_LD_BY_WORD, r0, r1, o, p, t) \ + "j 4f\n\t" \ + "2:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 3f\n\t" \ + UNALIGNED_DW_REP2(UNALIGNED_LD_BY_HALF, r0, r1, o, p, t) \ + "j 4f\n\t" \ + "3:\n\t" \ + UNALIGNED_DW_REP2(UNALIGNED_LD_BY_BYTE, r0, r1, o, p, t) \ + "4:\n\t" + +/* Load 4 64-bits. Assumes o is a multiple of 8. The low 3 bits of p are tested once and the matching per-element load macro (double-word, word, half-word or byte) is passed to UNALIGNED_DW_REP4; label 4 is the common exit. t is overwritten. */ +#define UNALIGNED_LD4(r0, r1, r2, r3, o, p, t) \ + "andi " #t ", " #p ", 7\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_DW_REP4(UNALIGNED_LD_BY_DWORD, r0, r1, r2, r3, o, p, t) \ + "j 4f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 3\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_DW_REP4(UNALIGNED_LD_BY_WORD, r0, r1, r2, r3, o, p, t) \ + "j 4f\n\t" \ + "2:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 3f\n\t" \ + UNALIGNED_DW_REP4(UNALIGNED_LD_BY_HALF, r0, r1, r2, r3, o, p, t) \ + "j 4f\n\t" \ + "3:\n\t" \ + UNALIGNED_DW_REP4(UNALIGNED_LD_BY_BYTE, r0, r1, r2, r3, o, p, t) \ + "4:\n\t" + +/* Load 8 64-bits. Assumes o is a multiple of 8. The low 3 bits of p are tested once and the matching per-element load macro (double-word, word, half-word or byte) is passed to UNALIGNED_DW_REP8; label 4 is the common exit. t is overwritten. */ +#define UNALIGNED_LD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "andi " #t ", " #p ", 7\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_DW_REP8(UNALIGNED_LD_BY_DWORD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 4f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 3\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_DW_REP8(UNALIGNED_LD_BY_WORD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 4f\n\t" \ + "2:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 3f\n\t" \ + UNALIGNED_DW_REP8(UNALIGNED_LD_BY_HALF, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 4f\n\t" \ + "3:\n\t" \ + UNALIGNED_DW_REP8(UNALIGNED_LD_BY_BYTE, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "4:\n\t" + +/* Store 2 64-bits.
Assumes o is a multiple of 8. */ +#define UNALIGNED_SD2(r0, r1, o, p, t) \ + "andi " #t ", " #p ", 7\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_DW_REP2(UNALIGNED_SD_BY_DWORD, r0, r1, o, p, t) \ + "j 4f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 3\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_DW_REP2(UNALIGNED_SD_BY_WORD, r0, r1, o, p, t) \ + "j 4f\n\t" \ + "2:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 3f\n\t" \ + UNALIGNED_DW_REP2(UNALIGNED_SD_BY_HALF, r0, r1, o, p, t) \ + "j 4f\n\t" \ + "3:\n\t" \ + UNALIGNED_DW_REP2(UNALIGNED_SD_BY_BYTE, r0, r1, o, p, t) \ + "4:\n\t" + +/* Store 4 64-bits. Assumes o is a multiple of 8. The low 3 bits of p are tested once and the matching per-element store macro (double-word, word, half-word or byte) is passed to UNALIGNED_DW_REP4; label 4 is the common exit. t is overwritten. */ +#define UNALIGNED_SD4(r0, r1, r2, r3, o, p, t) \ + "andi " #t ", " #p ", 7\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_DW_REP4(UNALIGNED_SD_BY_DWORD, r0, r1, r2, r3, o, p, t) \ + "j 4f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 3\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_DW_REP4(UNALIGNED_SD_BY_WORD, r0, r1, r2, r3, o, p, t) \ + "j 4f\n\t" \ + "2:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 3f\n\t" \ + UNALIGNED_DW_REP4(UNALIGNED_SD_BY_HALF, r0, r1, r2, r3, o, p, t) \ + "j 4f\n\t" \ + "3:\n\t" \ + UNALIGNED_DW_REP4(UNALIGNED_SD_BY_BYTE, r0, r1, r2, r3, o, p, t) \ + "4:\n\t" + +/* Store 8 64-bits. Assumes o is a multiple of 8.
*/ +#define UNALIGNED_SD8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "andi " #t ", " #p ", 7\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_DW_REP8(UNALIGNED_SD_BY_DWORD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 4f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 3\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_DW_REP8(UNALIGNED_SD_BY_WORD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 4f\n\t" \ + "2:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 3f\n\t" \ + UNALIGNED_DW_REP8(UNALIGNED_SD_BY_HALF, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 4f\n\t" \ + "3:\n\t" \ + UNALIGNED_DW_REP8(UNALIGNED_SD_BY_BYTE, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "4:\n\t" + +/* Load 2 32-bits, zero extended. Assumes o is a multiple of 4. The low 2 bits of p are tested once and the matching per-element load macro (word, half-word or byte) is passed to UNALIGNED_W_REP2; label 3 is the common exit. t is overwritten. */ +#define UNALIGNED_LWU2(r0, r1, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_LWU_BY_WORD, r0, r1, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_LWU_BY_HALF, r0, r1, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_LWU_BY_BYTE, r0, r1, o, p, t) \ + "3:\n\t" + +/* Load 4 32-bits, zero extended. Assumes o is a multiple of 4. The low 2 bits of p are tested once and the matching per-element load macro (word, half-word or byte) is passed to UNALIGNED_W_REP4; label 3 is the common exit. t is overwritten. */ +#define UNALIGNED_LWU4(r0, r1, r2, r3, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_LWU_BY_WORD, r0, r1, r2, r3, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_LWU_BY_HALF, r0, r1, r2, r3, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_LWU_BY_BYTE, r0, r1, r2, r3, o, p, t) \ + "3:\n\t" + +/* Load 8 32-bits, zero extended. Assumes o is a multiple of 4.
*/ +#define UNALIGNED_LWU8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_LWU_BY_WORD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_LWU_BY_HALF, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_LWU_BY_BYTE, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "3:\n\t" + +/* Load 2 32-bits, sign extended. Assumes o is a multiple of 4. The low 2 bits of p are tested once and the matching per-element load macro (word, half-word or byte) is passed to UNALIGNED_W_REP2; label 3 is the common exit. t is overwritten. */ +#define UNALIGNED_LW2(r0, r1, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_LW_BY_WORD, r0, r1, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_LW_BY_HALF, r0, r1, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_LW_BY_BYTE, r0, r1, o, p, t) \ + "3:\n\t" + +/* Load 4 32-bits, sign extended. Assumes o is a multiple of 4. The low 2 bits of p are tested once and the matching per-element load macro (word, half-word or byte) is passed to UNALIGNED_W_REP4; label 3 is the common exit. t is overwritten. */ +#define UNALIGNED_LW4(r0, r1, r2, r3, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_LW_BY_WORD, r0, r1, r2, r3, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_LW_BY_HALF, r0, r1, r2, r3, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_LW_BY_BYTE, r0, r1, r2, r3, o, p, t) \ + "3:\n\t" + +/* Load 8 32-bits, sign extended. Assumes o is a multiple of 4.
*/ +#define UNALIGNED_LW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_LW_BY_WORD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_LW_BY_HALF, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_LW_BY_BYTE, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "3:\n\t" + +/* Store 2 32-bits. Assumes o is a multiple of 4. The low 2 bits of p are tested once and the matching per-element store macro (word, half-word or byte) is passed to UNALIGNED_W_REP2; label 3 is the common exit. t is overwritten. */ +#define UNALIGNED_SW2(r0, r1, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_SW_BY_WORD, r0, r1, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_SW_BY_HALF, r0, r1, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP2(UNALIGNED_SW_BY_BYTE, r0, r1, o, p, t) \ + "3:\n\t" + +/* Store 4 32-bits. Assumes o is a multiple of 4. The low 2 bits of p are tested once and the matching per-element store macro (word, half-word or byte) is passed to UNALIGNED_W_REP4; label 3 is the common exit. t is overwritten. */ +#define UNALIGNED_SW4(r0, r1, r2, r3, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_SW_BY_WORD, r0, r1, r2, r3, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_SW_BY_HALF, r0, r1, r2, r3, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP4(UNALIGNED_SW_BY_BYTE, r0, r1, r2, r3, o, p, t) \ + "3:\n\t" + +/* Store 8 32-bits. Assumes o is a multiple of 4.
*/ +#define UNALIGNED_SW8(r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "andi " #t ", " #p ", 3\n\t" \ + "bnez " #t ", 1f\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_SW_BY_WORD, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 3f\n\t" \ + "1:\n\t" \ + "andi " #t ", " #t ", 1\n\t" \ + "bnez " #t ", 2f\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_SW_BY_HALF, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "j 3f\n\t" \ + "2:\n\t" \ + UNALIGNED_W_REP8(UNALIGNED_SW_BY_BYTE, \ + r0, r1, r2, r3, r4, r5, r6, r7, o, p, t) \ + "3:\n\t" /* Label 3 is the common exit: the word and half-word paths jump here, the byte path falls through. */ + +#endif /* !WOLFSSL_RISCV_ASM_NO_UNALIGNED */ + #define VLSEG_V(vd, rs1, cnt, width) \ ASM_WORD(0b0000111 | (width << 12) | (0b10101000 << 20) | \